30 30 9 3 3 7 4 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 // SPDX-License-Identifier: GPL-2.0-only /* * v4l2-fh.c * * V4L2 file handles. * * Copyright (C) 2009--2010 Nokia Corporation. * * Contact: Sakari Ailus <sakari.ailus@iki.fi> */ #include <linux/bitops.h> #include <linux/slab.h> #include <linux/export.h> #include <media/v4l2-dev.h> #include <media/v4l2-fh.h> #include <media/v4l2-event.h> #include <media/v4l2-ioctl.h> #include <media/v4l2-mc.h> void v4l2_fh_init(struct v4l2_fh *fh, struct video_device *vdev) { fh->vdev = vdev; /* Inherit from video_device. May be overridden by the driver. */ fh->ctrl_handler = vdev->ctrl_handler; INIT_LIST_HEAD(&fh->list); set_bit(V4L2_FL_USES_V4L2_FH, &fh->vdev->flags); /* * determine_valid_ioctls() does not know if struct v4l2_fh * is used by this driver, but here we do. So enable the * prio ioctls here. */ set_bit(_IOC_NR(VIDIOC_G_PRIORITY), vdev->valid_ioctls); set_bit(_IOC_NR(VIDIOC_S_PRIORITY), vdev->valid_ioctls); fh->prio = V4L2_PRIORITY_UNSET; init_waitqueue_head(&fh->wait); INIT_LIST_HEAD(&fh->available); INIT_LIST_HEAD(&fh->subscribed); fh->sequence = -1; mutex_init(&fh->subscribe_lock); } EXPORT_SYMBOL_GPL(v4l2_fh_init); void v4l2_fh_add(struct v4l2_fh *fh) { unsigned long flags; v4l2_prio_open(fh->vdev->prio, &fh->prio); spin_lock_irqsave(&fh->vdev->fh_lock, flags); list_add(&fh->list, &fh->vdev->fh_list); spin_unlock_irqrestore(&fh->vdev->fh_lock, flags); } EXPORT_SYMBOL_GPL(v4l2_fh_add); int v4l2_fh_open(struct file *filp) { struct video_device *vdev = video_devdata(filp); struct v4l2_fh *fh = kzalloc(sizeof(*fh), GFP_KERNEL); filp->private_data = fh; if (fh == NULL) return -ENOMEM; v4l2_fh_init(fh, vdev); v4l2_fh_add(fh); return 0; } EXPORT_SYMBOL_GPL(v4l2_fh_open); void v4l2_fh_del(struct v4l2_fh *fh) { unsigned long flags; spin_lock_irqsave(&fh->vdev->fh_lock, flags); list_del_init(&fh->list); spin_unlock_irqrestore(&fh->vdev->fh_lock, flags); v4l2_prio_close(fh->vdev->prio, fh->prio); } EXPORT_SYMBOL_GPL(v4l2_fh_del); void v4l2_fh_exit(struct v4l2_fh *fh) { if (fh->vdev == NULL) return; v4l_disable_media_source(fh->vdev); v4l2_event_unsubscribe_all(fh); mutex_destroy(&fh->subscribe_lock); fh->vdev = NULL; } EXPORT_SYMBOL_GPL(v4l2_fh_exit); int v4l2_fh_release(struct file *filp) { struct v4l2_fh *fh = filp->private_data; if (fh) { v4l2_fh_del(fh); v4l2_fh_exit(fh); kfree(fh); filp->private_data = NULL; } return 0; } EXPORT_SYMBOL_GPL(v4l2_fh_release); int v4l2_fh_is_singular(struct v4l2_fh *fh) { unsigned long flags; int is_singular; if (fh == NULL || fh->vdev == NULL) return 0; spin_lock_irqsave(&fh->vdev->fh_lock, flags); is_singular = list_is_singular(&fh->list); spin_unlock_irqrestore(&fh->vdev->fh_lock, flags); return is_singular; } EXPORT_SYMBOL_GPL(v4l2_fh_is_singular);
103 103 97 12 149 106 26 25 3 21 23 21 26 9 9 9 1 2 8 26 26 26 150 138 26 52 8 28 10 20 20 15 3 12 65 55 26 26 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 /* * net/tipc/msg.c: TIPC message header routines * * Copyright (c) 2000-2006, 2014-2015, Ericsson AB * Copyright (c) 2005, 2010-2011, Wind River Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <net/sock.h> #include "core.h" #include "msg.h" #include "addr.h" #include "name_table.h" #include "crypto.h" #define BUF_ALIGN(x) ALIGN(x, 4) #define MAX_FORWARD_SIZE 1024 #ifdef CONFIG_TIPC_CRYPTO #define BUF_HEADROOM ALIGN(((LL_MAX_HEADER + 48) + EHDR_MAX_SIZE), 16) #define BUF_OVERHEAD (BUF_HEADROOM + TIPC_AES_GCM_TAG_SIZE) #else #define BUF_HEADROOM (LL_MAX_HEADER + 48) #define BUF_OVERHEAD BUF_HEADROOM #endif const int one_page_mtu = PAGE_SIZE - SKB_DATA_ALIGN(BUF_OVERHEAD) - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); /** * tipc_buf_acquire - creates a TIPC message buffer * @size: message size (including TIPC header) * @gfp: memory allocation flags * * Return: a new buffer with data pointers set to the specified size. * * NOTE: * Headroom is reserved to allow prepending of a data link header. * There may also be unrequested tailroom present at the buffer's end. */ struct sk_buff *tipc_buf_acquire(u32 size, gfp_t gfp) { struct sk_buff *skb; skb = alloc_skb_fclone(BUF_OVERHEAD + size, gfp); if (skb) { skb_reserve(skb, BUF_HEADROOM); skb_put(skb, size); skb->next = NULL; } return skb; } void tipc_msg_init(u32 own_node, struct tipc_msg *m, u32 user, u32 type, u32 hsize, u32 dnode) { memset(m, 0, hsize); msg_set_version(m); msg_set_user(m, user); msg_set_hdr_sz(m, hsize); msg_set_size(m, hsize); msg_set_prevnode(m, own_node); msg_set_type(m, type); if (hsize > SHORT_H_SIZE) { msg_set_orignode(m, own_node); msg_set_destnode(m, dnode); } } struct sk_buff *tipc_msg_create(uint user, uint type, uint hdr_sz, uint data_sz, u32 dnode, u32 onode, u32 dport, u32 oport, int errcode) { struct tipc_msg *msg; struct sk_buff *buf; buf = tipc_buf_acquire(hdr_sz + data_sz, GFP_ATOMIC); if (unlikely(!buf)) return NULL; msg = buf_msg(buf); tipc_msg_init(onode, msg, user, type, hdr_sz, dnode); msg_set_size(msg, hdr_sz + data_sz); msg_set_origport(msg, oport); msg_set_destport(msg, dport); msg_set_errcode(msg, errcode); return buf; } /* tipc_buf_append(): Append a buffer to the fragment list of another buffer * @*headbuf: in: NULL for first frag, otherwise value returned from prev call * out: set when successful non-complete reassembly, otherwise NULL * @*buf: in: the buffer to append. Always defined * out: head buf after successful complete reassembly, otherwise NULL * Returns 1 when reassembly complete, otherwise 0 */ int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf) { struct sk_buff *head = *headbuf; struct sk_buff *frag = *buf; struct sk_buff *tail = NULL; struct tipc_msg *msg; u32 fragid; int delta; bool headstolen; if (!frag) goto err; msg = buf_msg(frag); fragid = msg_type(msg); frag->next = NULL; skb_pull(frag, msg_hdr_sz(msg)); if (fragid == FIRST_FRAGMENT) { if (unlikely(head)) goto err; *buf = NULL; if (skb_has_frag_list(frag) && __skb_linearize(frag)) goto err; frag = skb_unshare(frag, GFP_ATOMIC); if (unlikely(!frag)) goto err; head = *headbuf = frag; TIPC_SKB_CB(head)->tail = NULL; return 0; } if (!head) goto err; if (skb_try_coalesce(head, frag, &headstolen, &delta)) { kfree_skb_partial(frag, headstolen); } else { tail = TIPC_SKB_CB(head)->tail; if (!skb_has_frag_list(head)) skb_shinfo(head)->frag_list = frag; else tail->next = frag; head->truesize += frag->truesize; head->data_len += frag->len; head->len += frag->len; TIPC_SKB_CB(head)->tail = frag; } if (fragid == LAST_FRAGMENT) { TIPC_SKB_CB(head)->validated = 0; if (unlikely(!tipc_msg_validate(&head))) goto err; *buf = head; TIPC_SKB_CB(head)->tail = NULL; *headbuf = NULL; return 1; } *buf = NULL; return 0; err: kfree_skb(*buf); kfree_skb(*headbuf); *buf = *headbuf = NULL; return 0; } /** * tipc_msg_append(): Append data to tail of an existing buffer queue * @_hdr: header to be used * @m: the data to be appended * @mss: max allowable size of buffer * @dlen: size of data to be appended * @txq: queue to append to * * Return: the number of 1k blocks appended or errno value */ int tipc_msg_append(struct tipc_msg *_hdr, struct msghdr *m, int dlen, int mss, struct sk_buff_head *txq) { struct sk_buff *skb; int accounted, total, curr; int mlen, cpy, rem = dlen; struct tipc_msg *hdr; skb = skb_peek_tail(txq); accounted = skb ? msg_blocks(buf_msg(skb)) : 0; total = accounted; do { if (!skb || skb->len >= mss) { skb = tipc_buf_acquire(mss, GFP_KERNEL); if (unlikely(!skb)) return -ENOMEM; skb_orphan(skb); skb_trim(skb, MIN_H_SIZE); hdr = buf_msg(skb); skb_copy_to_linear_data(skb, _hdr, MIN_H_SIZE); msg_set_hdr_sz(hdr, MIN_H_SIZE); msg_set_size(hdr, MIN_H_SIZE); __skb_queue_tail(txq, skb); total += 1; } hdr = buf_msg(skb); curr = msg_blocks(hdr); mlen = msg_size(hdr); cpy = min_t(size_t, rem, mss - mlen); if (cpy != copy_from_iter(skb->data + mlen, cpy, &m->msg_iter)) return -EFAULT; msg_set_size(hdr, mlen + cpy); skb_put(skb, cpy); rem -= cpy; total += msg_blocks(hdr) - curr; } while (rem > 0); return total - accounted; } /* tipc_msg_validate - validate basic format of received message * * This routine ensures a TIPC message has an acceptable header, and at least * as much data as the header indicates it should. The routine also ensures * that the entire message header is stored in the main fragment of the message * buffer, to simplify future access to message header fields. * * Note: Having extra info present in the message header or data areas is OK. * TIPC will ignore the excess, under the assumption that it is optional info * introduced by a later release of the protocol. */ bool tipc_msg_validate(struct sk_buff **_skb) { struct sk_buff *skb = *_skb; struct tipc_msg *hdr; int msz, hsz; /* Ensure that flow control ratio condition is satisfied */ if (unlikely(skb->truesize / buf_roundup_len(skb) >= 4)) { skb = skb_copy_expand(skb, BUF_HEADROOM, 0, GFP_ATOMIC); if (!skb) return false; kfree_skb(*_skb); *_skb = skb; } if (unlikely(TIPC_SKB_CB(skb)->validated)) return true; if (unlikely(!pskb_may_pull(skb, MIN_H_SIZE))) return false; hsz = msg_hdr_sz(buf_msg(skb)); if (unlikely(hsz < MIN_H_SIZE) || (hsz > MAX_H_SIZE)) return false; if (unlikely(!pskb_may_pull(skb, hsz))) return false; hdr = buf_msg(skb); if (unlikely(msg_version(hdr) != TIPC_VERSION)) return false; msz = msg_size(hdr); if (unlikely(msz < hsz)) return false; if (unlikely((msz - hsz) > TIPC_MAX_USER_MSG_SIZE)) return false; if (unlikely(skb->len < msz)) return false; TIPC_SKB_CB(skb)->validated = 1; return true; } /** * tipc_msg_fragment - build a fragment skb list for TIPC message * * @skb: TIPC message skb * @hdr: internal msg header to be put on the top of the fragments * @pktmax: max size of a fragment incl. the header * @frags: returned fragment skb list * * Return: 0 if the fragmentation is successful, otherwise: -EINVAL * or -ENOMEM */ int tipc_msg_fragment(struct sk_buff *skb, const struct tipc_msg *hdr, int pktmax, struct sk_buff_head *frags) { int pktno, nof_fragms, dsz, dmax, eat; struct tipc_msg *_hdr; struct sk_buff *_skb; u8 *data; /* Non-linear buffer? */ if (skb_linearize(skb)) return -ENOMEM; data = (u8 *)skb->data; dsz = msg_size(buf_msg(skb)); dmax = pktmax - INT_H_SIZE; if (dsz <= dmax || !dmax) return -EINVAL; nof_fragms = dsz / dmax + 1; for (pktno = 1; pktno <= nof_fragms; pktno++) { if (pktno < nof_fragms) eat = dmax; else eat = dsz % dmax; /* Allocate a new fragment */ _skb = tipc_buf_acquire(INT_H_SIZE + eat, GFP_ATOMIC); if (!_skb) goto error; skb_orphan(_skb); __skb_queue_tail(frags, _skb); /* Copy header & data to the fragment */ skb_copy_to_linear_data(_skb, hdr, INT_H_SIZE); skb_copy_to_linear_data_offset(_skb, INT_H_SIZE, data, eat); data += eat; /* Update the fragment's header */ _hdr = buf_msg(_skb); msg_set_fragm_no(_hdr, pktno); msg_set_nof_fragms(_hdr, nof_fragms); msg_set_size(_hdr, INT_H_SIZE + eat); } return 0; error: __skb_queue_purge(frags); __skb_queue_head_init(frags); return -ENOMEM; } /** * tipc_msg_build - create buffer chain containing specified header and data * @mhdr: Message header, to be prepended to data * @m: User message * @offset: buffer offset for fragmented messages (FIXME) * @dsz: Total length of user data * @pktmax: Max packet size that can be used * @list: Buffer or chain of buffers to be returned to caller * * Note that the recursive call we are making here is safe, since it can * logically go only one further level down. * * Return: message data size or errno: -ENOMEM, -EFAULT */ int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m, int offset, int dsz, int pktmax, struct sk_buff_head *list) { int mhsz = msg_hdr_sz(mhdr); struct tipc_msg pkthdr; int msz = mhsz + dsz; int pktrem = pktmax; struct sk_buff *skb; int drem = dsz; int pktno = 1; char *pktpos; int pktsz; int rc; msg_set_size(mhdr, msz); /* No fragmentation needed? */ if (likely(msz <= pktmax)) { skb = tipc_buf_acquire(msz, GFP_KERNEL); /* Fall back to smaller MTU if node local message */ if (unlikely(!skb)) { if (pktmax != MAX_MSG_SIZE) return -ENOMEM; rc = tipc_msg_build(mhdr, m, offset, dsz, one_page_mtu, list); if (rc != dsz) return rc; if (tipc_msg_assemble(list)) return dsz; return -ENOMEM; } skb_orphan(skb); __skb_queue_tail(list, skb); skb_copy_to_linear_data(skb, mhdr, mhsz); pktpos = skb->data + mhsz; if (copy_from_iter_full(pktpos, dsz, &m->msg_iter)) return dsz; rc = -EFAULT; goto error; } /* Prepare reusable fragment header */ tipc_msg_init(msg_prevnode(mhdr), &pkthdr, MSG_FRAGMENTER, FIRST_FRAGMENT, INT_H_SIZE, msg_destnode(mhdr)); msg_set_size(&pkthdr, pktmax); msg_set_fragm_no(&pkthdr, pktno); msg_set_importance(&pkthdr, msg_importance(mhdr)); /* Prepare first fragment */ skb = tipc_buf_acquire(pktmax, GFP_KERNEL); if (!skb) return -ENOMEM; skb_orphan(skb); __skb_queue_tail(list, skb); pktpos = skb->data; skb_copy_to_linear_data(skb, &pkthdr, INT_H_SIZE); pktpos += INT_H_SIZE; pktrem -= INT_H_SIZE; skb_copy_to_linear_data_offset(skb, INT_H_SIZE, mhdr, mhsz); pktpos += mhsz; pktrem -= mhsz; do { if (drem < pktrem) pktrem = drem; if (!copy_from_iter_full(pktpos, pktrem, &m->msg_iter)) { rc = -EFAULT; goto error; } drem -= pktrem; if (!drem) break; /* Prepare new fragment: */ if (drem < (pktmax - INT_H_SIZE)) pktsz = drem + INT_H_SIZE; else pktsz = pktmax; skb = tipc_buf_acquire(pktsz, GFP_KERNEL); if (!skb) { rc = -ENOMEM; goto error; } skb_orphan(skb); __skb_queue_tail(list, skb); msg_set_type(&pkthdr, FRAGMENT); msg_set_size(&pkthdr, pktsz); msg_set_fragm_no(&pkthdr, ++pktno); skb_copy_to_linear_data(skb, &pkthdr, INT_H_SIZE); pktpos = skb->data + INT_H_SIZE; pktrem = pktsz - INT_H_SIZE; } while (1); msg_set_type(buf_msg(skb), LAST_FRAGMENT); return dsz; error: __skb_queue_purge(list); __skb_queue_head_init(list); return rc; } /** * tipc_msg_bundle - Append contents of a buffer to tail of an existing one * @bskb: the bundle buffer to append to * @msg: message to be appended * @max: max allowable size for the bundle buffer * * Return: "true" if bundling has been performed, otherwise "false" */ static bool tipc_msg_bundle(struct sk_buff *bskb, struct tipc_msg *msg, u32 max) { struct tipc_msg *bmsg = buf_msg(bskb); u32 msz, bsz, offset, pad; msz = msg_size(msg); bsz = msg_size(bmsg); offset = BUF_ALIGN(bsz); pad = offset - bsz; if (unlikely(skb_tailroom(bskb) < (pad + msz))) return false; if (unlikely(max < (offset + msz))) return false; skb_put(bskb, pad + msz); skb_copy_to_linear_data_offset(bskb, offset, msg, msz); msg_set_size(bmsg, offset + msz); msg_set_msgcnt(bmsg, msg_msgcnt(bmsg) + 1); return true; } /** * tipc_msg_try_bundle - Try to bundle a new message to the last one * @tskb: the last/target message to which the new one will be appended * @skb: the new message skb pointer * @mss: max message size (header inclusive) * @dnode: destination node for the message * @new_bundle: if this call made a new bundle or not * * Return: "true" if the new message skb is potential for bundling this time or * later, in the case a bundling has been done this time, the skb is consumed * (the skb pointer = NULL). * Otherwise, "false" if the skb cannot be bundled at all. */ bool tipc_msg_try_bundle(struct sk_buff *tskb, struct sk_buff **skb, u32 mss, u32 dnode, bool *new_bundle) { struct tipc_msg *msg, *inner, *outer; u32 tsz; /* First, check if the new buffer is suitable for bundling */ msg = buf_msg(*skb); if (msg_user(msg) == MSG_FRAGMENTER) return false; if (msg_user(msg) == TUNNEL_PROTOCOL) return false; if (msg_user(msg) == BCAST_PROTOCOL) return false; if (mss <= INT_H_SIZE + msg_size(msg)) return false; /* Ok, but the last/target buffer can be empty? */ if (unlikely(!tskb)) return true; /* Is it a bundle already? Try to bundle the new message to it */ if (msg_user(buf_msg(tskb)) == MSG_BUNDLER) { *new_bundle = false; goto bundle; } /* Make a new bundle of the two messages if possible */ tsz = msg_size(buf_msg(tskb)); if (unlikely(mss < BUF_ALIGN(INT_H_SIZE + tsz) + msg_size(msg))) return true; if (unlikely(pskb_expand_head(tskb, INT_H_SIZE, mss - tsz - INT_H_SIZE, GFP_ATOMIC))) return true; inner = buf_msg(tskb); skb_push(tskb, INT_H_SIZE); outer = buf_msg(tskb); tipc_msg_init(msg_prevnode(inner), outer, MSG_BUNDLER, 0, INT_H_SIZE, dnode); msg_set_importance(outer, msg_importance(inner)); msg_set_size(outer, INT_H_SIZE + tsz); msg_set_msgcnt(outer, 1); *new_bundle = true; bundle: if (likely(tipc_msg_bundle(tskb, msg, mss))) { consume_skb(*skb); *skb = NULL; } return true; } /** * tipc_msg_extract(): extract bundled inner packet from buffer * @skb: buffer to be extracted from. * @iskb: extracted inner buffer, to be returned * @pos: position in outer message of msg to be extracted. * Returns position of next msg. * Consumes outer buffer when last packet extracted * Return: true when there is an extracted buffer, otherwise false */ bool tipc_msg_extract(struct sk_buff *skb, struct sk_buff **iskb, int *pos) { struct tipc_msg *hdr, *ihdr; int imsz; *iskb = NULL; if (unlikely(skb_linearize(skb))) goto none; hdr = buf_msg(skb); if (unlikely(*pos > (msg_data_sz(hdr) - MIN_H_SIZE))) goto none; ihdr = (struct tipc_msg *)(msg_data(hdr) + *pos); imsz = msg_size(ihdr); if ((*pos + imsz) > msg_data_sz(hdr)) goto none; *iskb = tipc_buf_acquire(imsz, GFP_ATOMIC); if (!*iskb) goto none; skb_copy_to_linear_data(*iskb, ihdr, imsz); if (unlikely(!tipc_msg_validate(iskb))) goto none; *pos += BUF_ALIGN(imsz); return true; none: kfree_skb(skb); kfree_skb(*iskb); *iskb = NULL; return false; } /** * tipc_msg_reverse(): swap source and destination addresses and add error code * @own_node: originating node id for reversed message * @skb: buffer containing message to be reversed; will be consumed * @err: error code to be set in message, if any * Replaces consumed buffer with new one when successful * Return: true if success, otherwise false */ bool tipc_msg_reverse(u32 own_node, struct sk_buff **skb, int err) { struct sk_buff *_skb = *skb; struct tipc_msg *_hdr, *hdr; int hlen, dlen; if (skb_linearize(_skb)) goto exit; _hdr = buf_msg(_skb); dlen = min_t(uint, msg_data_sz(_hdr), MAX_FORWARD_SIZE); hlen = msg_hdr_sz(_hdr); if (msg_dest_droppable(_hdr)) goto exit; if (msg_errcode(_hdr)) goto exit; /* Never return SHORT header */ if (hlen == SHORT_H_SIZE) hlen = BASIC_H_SIZE; /* Don't return data along with SYN+, - sender has a clone */ if (msg_is_syn(_hdr) && err == TIPC_ERR_OVERLOAD) dlen = 0; /* Allocate new buffer to return */ *skb = tipc_buf_acquire(hlen + dlen, GFP_ATOMIC); if (!*skb) goto exit; memcpy((*skb)->data, _skb->data, msg_hdr_sz(_hdr)); memcpy((*skb)->data + hlen, msg_data(_hdr), dlen); /* Build reverse header in new buffer */ hdr = buf_msg(*skb); msg_set_hdr_sz(hdr, hlen); msg_set_errcode(hdr, err); msg_set_non_seq(hdr, 0); msg_set_origport(hdr, msg_destport(_hdr)); msg_set_destport(hdr, msg_origport(_hdr)); msg_set_destnode(hdr, msg_prevnode(_hdr)); msg_set_prevnode(hdr, own_node); msg_set_orignode(hdr, own_node); msg_set_size(hdr, hlen + dlen); skb_orphan(_skb); kfree_skb(_skb); return true; exit: kfree_skb(_skb); *skb = NULL; return false; } bool tipc_msg_skb_clone(struct sk_buff_head *msg, struct sk_buff_head *cpy) { struct sk_buff *skb, *_skb; skb_queue_walk(msg, skb) { _skb = skb_clone(skb, GFP_ATOMIC); if (!_skb) { __skb_queue_purge(cpy); pr_err_ratelimited("Failed to clone buffer chain\n"); return false; } __skb_queue_tail(cpy, _skb); } return true; } /** * tipc_msg_lookup_dest(): try to find new destination for named message * @net: pointer to associated network namespace * @skb: the buffer containing the message. * @err: error code to be used by caller if lookup fails * Does not consume buffer * Return: true if a destination is found, false otherwise */ bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err) { struct tipc_msg *msg = buf_msg(skb); u32 scope = msg_lookup_scope(msg); u32 self = tipc_own_addr(net); u32 inst = msg_nameinst(msg); struct tipc_socket_addr sk; struct tipc_uaddr ua; if (!msg_isdata(msg)) return false; if (!msg_named(msg)) return false; if (msg_errcode(msg)) return false; *err = TIPC_ERR_NO_NAME; if (skb_linearize(skb)) return false; msg = buf_msg(skb); if (msg_reroute_cnt(msg)) return false; tipc_uaddr(&ua, TIPC_SERVICE_RANGE, scope, msg_nametype(msg), inst, inst); sk.node = tipc_scope2node(net, scope); if (!tipc_nametbl_lookup_anycast(net, &ua, &sk)) return false; msg_incr_reroute_cnt(msg); if (sk.node != self) msg_set_prevnode(msg, self); msg_set_destnode(msg, sk.node); msg_set_destport(msg, sk.ref); *err = TIPC_OK; return true; } /* tipc_msg_assemble() - assemble chain of fragments into one message */ bool tipc_msg_assemble(struct sk_buff_head *list) { struct sk_buff *skb, *tmp = NULL; if (skb_queue_len(list) == 1) return true; while ((skb = __skb_dequeue(list))) { skb->next = NULL; if (tipc_buf_append(&tmp, &skb)) { __skb_queue_tail(list, skb); return true; } if (!tmp) break; } __skb_queue_purge(list); __skb_queue_head_init(list); pr_warn("Failed do assemble buffer\n"); return false; } /* tipc_msg_reassemble() - clone a buffer chain of fragments and * reassemble the clones into one message */ bool tipc_msg_reassemble(struct sk_buff_head *list, struct sk_buff_head *rcvq) { struct sk_buff *skb, *_skb; struct sk_buff *frag = NULL; struct sk_buff *head = NULL; int hdr_len; /* Copy header if single buffer */ if (skb_queue_len(list) == 1) { skb = skb_peek(list); hdr_len = skb_headroom(skb) + msg_hdr_sz(buf_msg(skb)); _skb = __pskb_copy(skb, hdr_len, GFP_ATOMIC); if (!_skb) return false; __skb_queue_tail(rcvq, _skb); return true; } /* Clone all fragments and reassemble */ skb_queue_walk(list, skb) { frag = skb_clone(skb, GFP_ATOMIC); if (!frag) goto error; frag->next = NULL; if (tipc_buf_append(&head, &frag)) break; if (!head) goto error; } __skb_queue_tail(rcvq, frag); return true; error: pr_warn("Failed do clone local mcast rcv buffer\n"); kfree_skb(head); return false; } bool tipc_msg_pskb_copy(u32 dst, struct sk_buff_head *msg, struct sk_buff_head *cpy) { struct sk_buff *skb, *_skb; skb_queue_walk(msg, skb) { _skb = pskb_copy(skb, GFP_ATOMIC); if (!_skb) { __skb_queue_purge(cpy); return false; } msg_set_destnode(buf_msg(_skb), dst); __skb_queue_tail(cpy, _skb); } return true; } /* tipc_skb_queue_sorted(); sort pkt into list according to sequence number * @list: list to be appended to * @seqno: sequence number of buffer to add * @skb: buffer to add */ bool __tipc_skb_queue_sorted(struct sk_buff_head *list, u16 seqno, struct sk_buff *skb) { struct sk_buff *_skb, *tmp; if (skb_queue_empty(list) || less(seqno, buf_seqno(skb_peek(list)))) { __skb_queue_head(list, skb); return true; } if (more(seqno, buf_seqno(skb_peek_tail(list)))) { __skb_queue_tail(list, skb); return true; } skb_queue_walk_safe(list, _skb, tmp) { if (more(seqno, buf_seqno(_skb))) continue; if (seqno == buf_seqno(_skb)) break; __skb_queue_before(list, _skb, skb); return true; } kfree_skb(skb); return false; } void tipc_skb_reject(struct net *net, int err, struct sk_buff *skb, struct sk_buff_head *xmitq) { if (tipc_msg_reverse(tipc_own_addr(net), &skb, err)) __skb_queue_tail(xmitq, skb); }
3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 // SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/data.c * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ */ #include <linux/fs.h> #include <linux/f2fs_fs.h> #include <linux/buffer_head.h> #include <linux/sched/mm.h> #include <linux/mpage.h> #include <linux/writeback.h> #include <linux/pagevec.h> #include <linux/blkdev.h> #include <linux/bio.h> #include <linux/blk-crypto.h> #include <linux/swap.h> #include <linux/prefetch.h> #include <linux/uio.h> #include <linux/sched/signal.h> #include <linux/fiemap.h> #include <linux/iomap.h> #include "f2fs.h" #include "node.h" #include "segment.h" #include "iostat.h" #include <trace/events/f2fs.h> #define NUM_PREALLOC_POST_READ_CTXS 128 static struct kmem_cache *bio_post_read_ctx_cache; static struct kmem_cache *bio_entry_slab; static mempool_t *bio_post_read_ctx_pool; static struct bio_set f2fs_bioset; #define F2FS_BIO_POOL_SIZE NR_CURSEG_TYPE int __init f2fs_init_bioset(void) { return bioset_init(&f2fs_bioset, F2FS_BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); } void f2fs_destroy_bioset(void) { bioset_exit(&f2fs_bioset); } static bool __is_cp_guaranteed(struct page *page) { struct address_space *mapping = page->mapping; struct inode *inode; struct f2fs_sb_info *sbi; if (!mapping) return false; inode = mapping->host; sbi = F2FS_I_SB(inode); if (inode->i_ino == F2FS_META_INO(sbi) || inode->i_ino == F2FS_NODE_INO(sbi) || S_ISDIR(inode->i_mode)) return true; if (f2fs_is_compressed_page(page)) return false; if ((S_ISREG(inode->i_mode) && IS_NOQUOTA(inode)) || page_private_gcing(page)) return true; return false; } static enum count_type __read_io_type(struct page *page) { struct address_space *mapping = page_file_mapping(page); if (mapping) { struct inode *inode = mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); if (inode->i_ino == F2FS_META_INO(sbi)) return F2FS_RD_META; if (inode->i_ino == F2FS_NODE_INO(sbi)) return F2FS_RD_NODE; } return F2FS_RD_DATA; } /* postprocessing steps for read bios */ enum bio_post_read_step { #ifdef CONFIG_FS_ENCRYPTION STEP_DECRYPT = BIT(0), #else STEP_DECRYPT = 0, /* compile out the decryption-related code */ #endif #ifdef CONFIG_F2FS_FS_COMPRESSION STEP_DECOMPRESS = BIT(1), #else STEP_DECOMPRESS = 0, /* compile out the decompression-related code */ #endif #ifdef CONFIG_FS_VERITY STEP_VERITY = BIT(2), #else STEP_VERITY = 0, /* compile out the verity-related code */ #endif }; struct bio_post_read_ctx { struct bio *bio; struct f2fs_sb_info *sbi; struct work_struct work; unsigned int enabled_steps; /* * decompression_attempted keeps track of whether * f2fs_end_read_compressed_page() has been called on the pages in the * bio that belong to a compressed cluster yet. */ bool decompression_attempted; block_t fs_blkaddr; }; /* * Update and unlock a bio's pages, and free the bio. * * This marks pages up-to-date only if there was no error in the bio (I/O error, * decryption error, or verity error), as indicated by bio->bi_status. * * "Compressed pages" (pagecache pages backed by a compressed cluster on-disk) * aren't marked up-to-date here, as decompression is done on a per-compression- * cluster basis rather than a per-bio basis. Instead, we only must do two * things for each compressed page here: call f2fs_end_read_compressed_page() * with failed=true if an error occurred before it would have normally gotten * called (i.e., I/O error or decryption error, but *not* verity error), and * release the bio's reference to the decompress_io_ctx of the page's cluster. */ static void f2fs_finish_read_bio(struct bio *bio, bool in_task) { struct bio_vec *bv; struct bvec_iter_all iter_all; struct bio_post_read_ctx *ctx = bio->bi_private; bio_for_each_segment_all(bv, bio, iter_all) { struct page *page = bv->bv_page; if (f2fs_is_compressed_page(page)) { if (ctx && !ctx->decompression_attempted) f2fs_end_read_compressed_page(page, true, 0, in_task); f2fs_put_page_dic(page, in_task); continue; } if (bio->bi_status) ClearPageUptodate(page); else SetPageUptodate(page); dec_page_count(F2FS_P_SB(page), __read_io_type(page)); unlock_page(page); } if (ctx) mempool_free(ctx, bio_post_read_ctx_pool); bio_put(bio); } static void f2fs_verify_bio(struct work_struct *work) { struct bio_post_read_ctx *ctx = container_of(work, struct bio_post_read_ctx, work); struct bio *bio = ctx->bio; bool may_have_compressed_pages = (ctx->enabled_steps & STEP_DECOMPRESS); /* * fsverity_verify_bio() may call readahead() again, and while verity * will be disabled for this, decryption and/or decompression may still * be needed, resulting in another bio_post_read_ctx being allocated. * So to prevent deadlocks we need to release the current ctx to the * mempool first. This assumes that verity is the last post-read step. */ mempool_free(ctx, bio_post_read_ctx_pool); bio->bi_private = NULL; /* * Verify the bio's pages with fs-verity. Exclude compressed pages, * as those were handled separately by f2fs_end_read_compressed_page(). */ if (may_have_compressed_pages) { struct bio_vec *bv; struct bvec_iter_all iter_all; bio_for_each_segment_all(bv, bio, iter_all) { struct page *page = bv->bv_page; if (!f2fs_is_compressed_page(page) && !fsverity_verify_page(page)) { bio->bi_status = BLK_STS_IOERR; break; } } } else { fsverity_verify_bio(bio); } f2fs_finish_read_bio(bio, true); } /* * If the bio's data needs to be verified with fs-verity, then enqueue the * verity work for the bio. Otherwise finish the bio now. * * Note that to avoid deadlocks, the verity work can't be done on the * decryption/decompression workqueue. This is because verifying the data pages * can involve reading verity metadata pages from the file, and these verity * metadata pages may be encrypted and/or compressed. */ static void f2fs_verify_and_finish_bio(struct bio *bio, bool in_task) { struct bio_post_read_ctx *ctx = bio->bi_private; if (ctx && (ctx->enabled_steps & STEP_VERITY)) { INIT_WORK(&ctx->work, f2fs_verify_bio); fsverity_enqueue_verify_work(&ctx->work); } else { f2fs_finish_read_bio(bio, in_task); } } /* * Handle STEP_DECOMPRESS by decompressing any compressed clusters whose last * remaining page was read by @ctx->bio. * * Note that a bio may span clusters (even a mix of compressed and uncompressed * clusters) or be for just part of a cluster. STEP_DECOMPRESS just indicates * that the bio includes at least one compressed page. The actual decompression * is done on a per-cluster basis, not a per-bio basis. */ static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx, bool in_task) { struct bio_vec *bv; struct bvec_iter_all iter_all; bool all_compressed = true; block_t blkaddr = ctx->fs_blkaddr; bio_for_each_segment_all(bv, ctx->bio, iter_all) { struct page *page = bv->bv_page; if (f2fs_is_compressed_page(page)) f2fs_end_read_compressed_page(page, false, blkaddr, in_task); else all_compressed = false; blkaddr++; } ctx->decompression_attempted = true; /* * Optimization: if all the bio's pages are compressed, then scheduling * the per-bio verity work is unnecessary, as verity will be fully * handled at the compression cluster level. */ if (all_compressed) ctx->enabled_steps &= ~STEP_VERITY; } static void f2fs_post_read_work(struct work_struct *work) { struct bio_post_read_ctx *ctx = container_of(work, struct bio_post_read_ctx, work); struct bio *bio = ctx->bio; if ((ctx->enabled_steps & STEP_DECRYPT) && !fscrypt_decrypt_bio(bio)) { f2fs_finish_read_bio(bio, true); return; } if (ctx->enabled_steps & STEP_DECOMPRESS) f2fs_handle_step_decompress(ctx, true); f2fs_verify_and_finish_bio(bio, true); } static void f2fs_read_end_io(struct bio *bio) { struct f2fs_sb_info *sbi = F2FS_P_SB(bio_first_page_all(bio)); struct bio_post_read_ctx *ctx; bool intask = in_task(); iostat_update_and_unbind_ctx(bio); ctx = bio->bi_private; if (time_to_inject(sbi, FAULT_READ_IO)) bio->bi_status = BLK_STS_IOERR; if (bio->bi_status) { f2fs_finish_read_bio(bio, intask); return; } if (ctx) { unsigned int enabled_steps = ctx->enabled_steps & (STEP_DECRYPT | STEP_DECOMPRESS); /* * If we have only decompression step between decompression and * decrypt, we don't need post processing for this. */ if (enabled_steps == STEP_DECOMPRESS && !f2fs_low_mem_mode(sbi)) { f2fs_handle_step_decompress(ctx, intask); } else if (enabled_steps) { INIT_WORK(&ctx->work, f2fs_post_read_work); queue_work(ctx->sbi->post_read_wq, &ctx->work); return; } } f2fs_verify_and_finish_bio(bio, intask); } static void f2fs_write_end_io(struct bio *bio) { struct f2fs_sb_info *sbi; struct bio_vec *bvec; struct bvec_iter_all iter_all; iostat_update_and_unbind_ctx(bio); sbi = bio->bi_private; if (time_to_inject(sbi, FAULT_WRITE_IO)) bio->bi_status = BLK_STS_IOERR; bio_for_each_segment_all(bvec, bio, iter_all) { struct page *page = bvec->bv_page; enum count_type type = WB_DATA_TYPE(page); if (page_private_dummy(page)) { clear_page_private_dummy(page); unlock_page(page); mempool_free(page, sbi->write_io_dummy); if (unlikely(bio->bi_status)) f2fs_stop_checkpoint(sbi, true, STOP_CP_REASON_WRITE_FAIL); continue; } fscrypt_finalize_bounce_page(&page); #ifdef CONFIG_F2FS_FS_COMPRESSION if (f2fs_is_compressed_page(page)) { f2fs_compress_write_end_io(bio, page); continue; } #endif if (unlikely(bio->bi_status)) { mapping_set_error(page->mapping, -EIO); if (type == F2FS_WB_CP_DATA) f2fs_stop_checkpoint(sbi, true, STOP_CP_REASON_WRITE_FAIL); } f2fs_bug_on(sbi, page->mapping == NODE_MAPPING(sbi) && page->index != nid_of_node(page)); dec_page_count(sbi, type); if (f2fs_in_warm_node_list(sbi, page)) f2fs_del_fsync_node_entry(sbi, page); clear_page_private_gcing(page); end_page_writeback(page); } if (!get_pages(sbi, F2FS_WB_CP_DATA) && wq_has_sleeper(&sbi->cp_wait)) wake_up(&sbi->cp_wait); bio_put(bio); } #ifdef CONFIG_BLK_DEV_ZONED static void f2fs_zone_write_end_io(struct bio *bio) { struct f2fs_bio_info *io = (struct f2fs_bio_info *)bio->bi_private; bio->bi_private = io->bi_private; complete(&io->zone_wait); f2fs_write_end_io(bio); } #endif struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi, block_t blk_addr, sector_t *sector) { struct block_device *bdev = sbi->sb->s_bdev; int i; if (f2fs_is_multi_device(sbi)) { for (i = 0; i < sbi->s_ndevs; i++) { if (FDEV(i).start_blk <= blk_addr && FDEV(i).end_blk >= blk_addr) { blk_addr -= FDEV(i).start_blk; bdev = FDEV(i).bdev; break; } } } if (sector) *sector = SECTOR_FROM_BLOCK(blk_addr); return bdev; } int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr) { int i; if (!f2fs_is_multi_device(sbi)) return 0; for (i = 0; i < sbi->s_ndevs; i++) if (FDEV(i).start_blk <= blkaddr && FDEV(i).end_blk >= blkaddr) return i; return 0; } static blk_opf_t f2fs_io_flags(struct f2fs_io_info *fio) { unsigned int temp_mask = GENMASK(NR_TEMP_TYPE - 1, 0); unsigned int fua_flag, meta_flag, io_flag; blk_opf_t op_flags = 0; if (fio->op != REQ_OP_WRITE) return 0; if (fio->type == DATA) io_flag = fio->sbi->data_io_flag; else if (fio->type == NODE) io_flag = fio->sbi->node_io_flag; else return 0; fua_flag = io_flag & temp_mask; meta_flag = (io_flag >> NR_TEMP_TYPE) & temp_mask; /* * data/node io flag bits per temp: * REQ_META | REQ_FUA | * 5 | 4 | 3 | 2 | 1 | 0 | * Cold | Warm | Hot | Cold | Warm | Hot | */ if (BIT(fio->temp) & meta_flag) op_flags |= REQ_META; if (BIT(fio->temp) & fua_flag) op_flags |= REQ_FUA; return op_flags; } static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages) { struct f2fs_sb_info *sbi = fio->sbi; struct block_device *bdev; sector_t sector; struct bio *bio; bdev = f2fs_target_device(sbi, fio->new_blkaddr, &sector); bio = bio_alloc_bioset(bdev, npages, fio->op | fio->op_flags | f2fs_io_flags(fio), GFP_NOIO, &f2fs_bioset); bio->bi_iter.bi_sector = sector; if (is_read_io(fio->op)) { bio->bi_end_io = f2fs_read_end_io; bio->bi_private = NULL; } else { bio->bi_end_io = f2fs_write_end_io; bio->bi_private = sbi; } iostat_alloc_and_bind_ctx(sbi, bio, NULL); if (fio->io_wbc) wbc_init_bio(fio->io_wbc, bio); return bio; } static void f2fs_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode, pgoff_t first_idx, const struct f2fs_io_info *fio, gfp_t gfp_mask) { /* * The f2fs garbage collector sets ->encrypted_page when it wants to * read/write raw data without encryption. */ if (!fio || !fio->encrypted_page) fscrypt_set_bio_crypt_ctx(bio, inode, first_idx, gfp_mask); } static bool f2fs_crypt_mergeable_bio(struct bio *bio, const struct inode *inode, pgoff_t next_idx, const struct f2fs_io_info *fio) { /* * The f2fs garbage collector sets ->encrypted_page when it wants to * read/write raw data without encryption. */ if (fio && fio->encrypted_page) return !bio_has_crypt_ctx(bio); return fscrypt_mergeable_bio(bio, inode, next_idx); } void f2fs_submit_read_bio(struct f2fs_sb_info *sbi, struct bio *bio, enum page_type type) { WARN_ON_ONCE(!is_read_io(bio_op(bio))); trace_f2fs_submit_read_bio(sbi->sb, type, bio); iostat_update_submit_ctx(bio, type); submit_bio(bio); } static void f2fs_align_write_bio(struct f2fs_sb_info *sbi, struct bio *bio) { unsigned int start = (bio->bi_iter.bi_size >> F2FS_BLKSIZE_BITS) % F2FS_IO_SIZE(sbi); if (start == 0) return; /* fill dummy pages */ for (; start < F2FS_IO_SIZE(sbi); start++) { struct page *page = mempool_alloc(sbi->write_io_dummy, GFP_NOIO | __GFP_NOFAIL); f2fs_bug_on(sbi, !page); lock_page(page); zero_user_segment(page, 0, PAGE_SIZE); set_page_private_dummy(page); if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) f2fs_bug_on(sbi, 1); } } static void f2fs_submit_write_bio(struct f2fs_sb_info *sbi, struct bio *bio, enum page_type type) { WARN_ON_ONCE(is_read_io(bio_op(bio))); if (type == DATA || type == NODE) { if (f2fs_lfs_mode(sbi) && current->plug) blk_finish_plug(current->plug); if (F2FS_IO_ALIGNED(sbi)) { f2fs_align_write_bio(sbi, bio); /* * In the NODE case, we lose next block address chain. * So, we need to do checkpoint in f2fs_sync_file. */ if (type == NODE) set_sbi_flag(sbi, SBI_NEED_CP); } } trace_f2fs_submit_write_bio(sbi->sb, type, bio); iostat_update_submit_ctx(bio, type); submit_bio(bio); } static void __submit_merged_bio(struct f2fs_bio_info *io) { struct f2fs_io_info *fio = &io->fio; if (!io->bio) return; if (is_read_io(fio->op)) { trace_f2fs_prepare_read_bio(io->sbi->sb, fio->type, io->bio); f2fs_submit_read_bio(io->sbi, io->bio, fio->type); } else { trace_f2fs_prepare_write_bio(io->sbi->sb, fio->type, io->bio); f2fs_submit_write_bio(io->sbi, io->bio, fio->type); } io->bio = NULL; } static bool __has_merged_page(struct bio *bio, struct inode *inode, struct page *page, nid_t ino) { struct bio_vec *bvec; struct bvec_iter_all iter_all; if (!bio) return false; if (!inode && !page && !ino) return true; bio_for_each_segment_all(bvec, bio, iter_all) { struct page *target = bvec->bv_page; if (fscrypt_is_bounce_page(target)) { target = fscrypt_pagecache_page(target); if (IS_ERR(target)) continue; } if (f2fs_is_compressed_page(target)) { target = f2fs_compress_control_page(target); if (IS_ERR(target)) continue; } if (inode && inode == target->mapping->host) return true; if (page && page == target) return true; if (ino && ino == ino_of_node(target)) return true; } return false; } int f2fs_init_write_merge_io(struct f2fs_sb_info *sbi) { int i; for (i = 0; i < NR_PAGE_TYPE; i++) { int n = (i == META) ? 1 : NR_TEMP_TYPE; int j; sbi->write_io[i] = f2fs_kmalloc(sbi, array_size(n, sizeof(struct f2fs_bio_info)), GFP_KERNEL); if (!sbi->write_io[i]) return -ENOMEM; for (j = HOT; j < n; j++) { init_f2fs_rwsem(&sbi->write_io[i][j].io_rwsem); sbi->write_io[i][j].sbi = sbi; sbi->write_io[i][j].bio = NULL; spin_lock_init(&sbi->write_io[i][j].io_lock); INIT_LIST_HEAD(&sbi->write_io[i][j].io_list); INIT_LIST_HEAD(&sbi->write_io[i][j].bio_list); init_f2fs_rwsem(&sbi->write_io[i][j].bio_list_lock); #ifdef CONFIG_BLK_DEV_ZONED init_completion(&sbi->write_io[i][j].zone_wait); sbi->write_io[i][j].zone_pending_bio = NULL; sbi->write_io[i][j].bi_private = NULL; #endif } } return 0; } static void __f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type, enum temp_type temp) { enum page_type btype = PAGE_TYPE_OF_BIO(type); struct f2fs_bio_info *io = sbi->write_io[btype] + temp; f2fs_down_write(&io->io_rwsem); if (!io->bio) goto unlock_out; /* change META to META_FLUSH in the checkpoint procedure */ if (type >= META_FLUSH) { io->fio.type = META_FLUSH; io->bio->bi_opf |= REQ_META | REQ_PRIO | REQ_SYNC; if (!test_opt(sbi, NOBARRIER)) io->bio->bi_opf |= REQ_PREFLUSH | REQ_FUA; } __submit_merged_bio(io); unlock_out: f2fs_up_write(&io->io_rwsem); } static void __submit_merged_write_cond(struct f2fs_sb_info *sbi, struct inode *inode, struct page *page, nid_t ino, enum page_type type, bool force) { enum temp_type temp; bool ret = true; for (temp = HOT; temp < NR_TEMP_TYPE; temp++) { if (!force) { enum page_type btype = PAGE_TYPE_OF_BIO(type); struct f2fs_bio_info *io = sbi->write_io[btype] + temp; f2fs_down_read(&io->io_rwsem); ret = __has_merged_page(io->bio, inode, page, ino); f2fs_up_read(&io->io_rwsem); } if (ret) __f2fs_submit_merged_write(sbi, type, temp); /* TODO: use HOT temp only for meta pages now. */ if (type >= META) break; } } void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type) { __submit_merged_write_cond(sbi, NULL, NULL, 0, type, true); } void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, struct inode *inode, struct page *page, nid_t ino, enum page_type type) { __submit_merged_write_cond(sbi, inode, page, ino, type, false); } void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi) { f2fs_submit_merged_write(sbi, DATA); f2fs_submit_merged_write(sbi, NODE); f2fs_submit_merged_write(sbi, META); } /* * Fill the locked page with data located in the block address. * A caller needs to unlock the page on failure. */ int f2fs_submit_page_bio(struct f2fs_io_info *fio) { struct bio *bio; struct page *page = fio->encrypted_page ? fio->encrypted_page : fio->page; if (!f2fs_is_valid_blkaddr(fio->sbi, fio->new_blkaddr, fio->is_por ? META_POR : (__is_meta_io(fio) ? META_GENERIC : DATA_GENERIC_ENHANCE))) { f2fs_handle_error(fio->sbi, ERROR_INVALID_BLKADDR); return -EFSCORRUPTED; } trace_f2fs_submit_page_bio(page, fio); /* Allocate a new bio */ bio = __bio_alloc(fio, 1); f2fs_set_bio_crypt_ctx(bio, fio->page->mapping->host, fio->page->index, fio, GFP_NOIO); if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { bio_put(bio); return -EFAULT; } if (fio->io_wbc && !is_read_io(fio->op)) wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE); inc_page_count(fio->sbi, is_read_io(fio->op) ? __read_io_type(page) : WB_DATA_TYPE(fio->page)); if (is_read_io(bio_op(bio))) f2fs_submit_read_bio(fio->sbi, bio, fio->type); else f2fs_submit_write_bio(fio->sbi, bio, fio->type); return 0; } static bool page_is_mergeable(struct f2fs_sb_info *sbi, struct bio *bio, block_t last_blkaddr, block_t cur_blkaddr) { if (unlikely(sbi->max_io_bytes && bio->bi_iter.bi_size >= sbi->max_io_bytes)) return false; if (last_blkaddr + 1 != cur_blkaddr) return false; return bio->bi_bdev == f2fs_target_device(sbi, cur_blkaddr, NULL); } static bool io_type_is_mergeable(struct f2fs_bio_info *io, struct f2fs_io_info *fio) { if (io->fio.op != fio->op) return false; return io->fio.op_flags == fio->op_flags; } static bool io_is_mergeable(struct f2fs_sb_info *sbi, struct bio *bio, struct f2fs_bio_info *io, struct f2fs_io_info *fio, block_t last_blkaddr, block_t cur_blkaddr) { if (F2FS_IO_ALIGNED(sbi) && (fio->type == DATA || fio->type == NODE)) { unsigned int filled_blocks = F2FS_BYTES_TO_BLK(bio->bi_iter.bi_size); unsigned int io_size = F2FS_IO_SIZE(sbi); unsigned int left_vecs = bio->bi_max_vecs - bio->bi_vcnt; /* IOs in bio is aligned and left space of vectors is not enough */ if (!(filled_blocks % io_size) && left_vecs < io_size) return false; } if (!page_is_mergeable(sbi, bio, last_blkaddr, cur_blkaddr)) return false; return io_type_is_mergeable(io, fio); } static void add_bio_entry(struct f2fs_sb_info *sbi, struct bio *bio, struct page *page, enum temp_type temp) { struct f2fs_bio_info *io = sbi->write_io[DATA] + temp; struct bio_entry *be; be = f2fs_kmem_cache_alloc(bio_entry_slab, GFP_NOFS, true, NULL); be->bio = bio; bio_get(bio); if (bio_add_page(bio, page, PAGE_SIZE, 0) != PAGE_SIZE) f2fs_bug_on(sbi, 1); f2fs_down_write(&io->bio_list_lock); list_add_tail(&be->list, &io->bio_list); f2fs_up_write(&io->bio_list_lock); } static void del_bio_entry(struct bio_entry *be) { list_del(&be->list); kmem_cache_free(bio_entry_slab, be); } static int add_ipu_page(struct f2fs_io_info *fio, struct bio **bio, struct page *page) { struct f2fs_sb_info *sbi = fio->sbi; enum temp_type temp; bool found = false; int ret = -EAGAIN; for (temp = HOT; temp < NR_TEMP_TYPE && !found; temp++) { struct f2fs_bio_info *io = sbi->write_io[DATA] + temp; struct list_head *head = &io->bio_list; struct bio_entry *be; f2fs_down_write(&io->bio_list_lock); list_for_each_entry(be, head, list) { if (be->bio != *bio) continue; found = true; f2fs_bug_on(sbi, !page_is_mergeable(sbi, *bio, *fio->last_block, fio->new_blkaddr)); if (f2fs_crypt_mergeable_bio(*bio, fio->page->mapping->host, fio->page->index, fio) && bio_add_page(*bio, page, PAGE_SIZE, 0) == PAGE_SIZE) { ret = 0; break; } /* page can't be merged into bio; submit the bio */ del_bio_entry(be); f2fs_submit_write_bio(sbi, *bio, DATA); break; } f2fs_up_write(&io->bio_list_lock); } if (ret) { bio_put(*bio); *bio = NULL; } return ret; } void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi, struct bio **bio, struct page *page) { enum temp_type temp; bool found = false; struct bio *target = bio ? *bio : NULL; f2fs_bug_on(sbi, !target && !page); for (temp = HOT; temp < NR_TEMP_TYPE && !found; temp++) { struct f2fs_bio_info *io = sbi->write_io[DATA] + temp; struct list_head *head = &io->bio_list; struct bio_entry *be; if (list_empty(head)) continue; f2fs_down_read(&io->bio_list_lock); list_for_each_entry(be, head, list) { if (target) found = (target == be->bio); else found = __has_merged_page(be->bio, NULL, page, 0); if (found) break; } f2fs_up_read(&io->bio_list_lock); if (!found) continue; found = false; f2fs_down_write(&io->bio_list_lock); list_for_each_entry(be, head, list) { if (target) found = (target == be->bio); else found = __has_merged_page(be->bio, NULL, page, 0); if (found) { target = be->bio; del_bio_entry(be); break; } } f2fs_up_write(&io->bio_list_lock); } if (found) f2fs_submit_write_bio(sbi, target, DATA); if (bio && *bio) { bio_put(*bio); *bio = NULL; } } int f2fs_merge_page_bio(struct f2fs_io_info *fio) { struct bio *bio = *fio->bio; struct page *page = fio->encrypted_page ? fio->encrypted_page : fio->page; if (!f2fs_is_valid_blkaddr(fio->sbi, fio->new_blkaddr, __is_meta_io(fio) ? META_GENERIC : DATA_GENERIC)) { f2fs_handle_error(fio->sbi, ERROR_INVALID_BLKADDR); return -EFSCORRUPTED; } trace_f2fs_submit_page_bio(page, fio); if (bio && !page_is_mergeable(fio->sbi, bio, *fio->last_block, fio->new_blkaddr)) f2fs_submit_merged_ipu_write(fio->sbi, &bio, NULL); alloc_new: if (!bio) { bio = __bio_alloc(fio, BIO_MAX_VECS); f2fs_set_bio_crypt_ctx(bio, fio->page->mapping->host, fio->page->index, fio, GFP_NOIO); add_bio_entry(fio->sbi, bio, page, fio->temp); } else { if (add_ipu_page(fio, &bio, page)) goto alloc_new; } if (fio->io_wbc) wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE); inc_page_count(fio->sbi, WB_DATA_TYPE(page)); *fio->last_block = fio->new_blkaddr; *fio->bio = bio; return 0; } #ifdef CONFIG_BLK_DEV_ZONED static bool is_end_zone_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr) { int devi = 0; if (f2fs_is_multi_device(sbi)) { devi = f2fs_target_device_index(sbi, blkaddr); if (blkaddr < FDEV(devi).start_blk || blkaddr > FDEV(devi).end_blk) { f2fs_err(sbi, "Invalid block %x", blkaddr); return false; } blkaddr -= FDEV(devi).start_blk; } return bdev_is_zoned(FDEV(devi).bdev) && f2fs_blkz_is_seq(sbi, devi, blkaddr) && (blkaddr % sbi->blocks_per_blkz == sbi->blocks_per_blkz - 1); } #endif void f2fs_submit_page_write(struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = fio->sbi; enum page_type btype = PAGE_TYPE_OF_BIO(fio->type); struct f2fs_bio_info *io = sbi->write_io[btype] + fio->temp; struct page *bio_page; f2fs_bug_on(sbi, is_read_io(fio->op)); f2fs_down_write(&io->io_rwsem); #ifdef CONFIG_BLK_DEV_ZONED if (f2fs_sb_has_blkzoned(sbi) && btype < META && io->zone_pending_bio) { wait_for_completion_io(&io->zone_wait); bio_put(io->zone_pending_bio); io->zone_pending_bio = NULL; io->bi_private = NULL; } #endif next: if (fio->in_list) { spin_lock(&io->io_lock); if (list_empty(&io->io_list)) { spin_unlock(&io->io_lock); goto out; } fio = list_first_entry(&io->io_list, struct f2fs_io_info, list); list_del(&fio->list); spin_unlock(&io->io_lock); } verify_fio_blkaddr(fio); if (fio->encrypted_page) bio_page = fio->encrypted_page; else if (fio->compressed_page) bio_page = fio->compressed_page; else bio_page = fio->page; /* set submitted = true as a return value */ fio->submitted = 1; inc_page_count(sbi, WB_DATA_TYPE(bio_page)); if (io->bio && (!io_is_mergeable(sbi, io->bio, io, fio, io->last_block_in_bio, fio->new_blkaddr) || !f2fs_crypt_mergeable_bio(io->bio, fio->page->mapping->host, bio_page->index, fio))) __submit_merged_bio(io); alloc_new: if (io->bio == NULL) { if (F2FS_IO_ALIGNED(sbi) && (fio->type == DATA || fio->type == NODE) && fio->new_blkaddr & F2FS_IO_SIZE_MASK(sbi)) { dec_page_count(sbi, WB_DATA_TYPE(bio_page)); fio->retry = 1; goto skip; } io->bio = __bio_alloc(fio, BIO_MAX_VECS); f2fs_set_bio_crypt_ctx(io->bio, fio->page->mapping->host, bio_page->index, fio, GFP_NOIO); io->fio = *fio; } if (bio_add_page(io->bio, bio_page, PAGE_SIZE, 0) < PAGE_SIZE) { __submit_merged_bio(io); goto alloc_new; } if (fio->io_wbc) wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE); io->last_block_in_bio = fio->new_blkaddr; trace_f2fs_submit_page_write(fio->page, fio); skip: if (fio->in_list) goto next; out: #ifdef CONFIG_BLK_DEV_ZONED if (f2fs_sb_has_blkzoned(sbi) && btype < META && is_end_zone_blkaddr(sbi, fio->new_blkaddr)) { bio_get(io->bio); reinit_completion(&io->zone_wait); io->bi_private = io->bio->bi_private; io->bio->bi_private = io; io->bio->bi_end_io = f2fs_zone_write_end_io; io->zone_pending_bio = io->bio; __submit_merged_bio(io); } #endif if (is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN) || !f2fs_is_checkpoint_ready(sbi)) __submit_merged_bio(io); f2fs_up_write(&io->io_rwsem); } static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, unsigned nr_pages, blk_opf_t op_flag, pgoff_t first_idx, bool for_write) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct bio *bio; struct bio_post_read_ctx *ctx = NULL; unsigned int post_read_steps = 0; sector_t sector; struct block_device *bdev = f2fs_target_device(sbi, blkaddr, &sector); bio = bio_alloc_bioset(bdev, bio_max_segs(nr_pages), REQ_OP_READ | op_flag, for_write ? GFP_NOIO : GFP_KERNEL, &f2fs_bioset); if (!bio) return ERR_PTR(-ENOMEM); bio->bi_iter.bi_sector = sector; f2fs_set_bio_crypt_ctx(bio, inode, first_idx, NULL, GFP_NOFS); bio->bi_end_io = f2fs_read_end_io; if (fscrypt_inode_uses_fs_layer_crypto(inode)) post_read_steps |= STEP_DECRYPT; if (f2fs_need_verity(inode, first_idx)) post_read_steps |= STEP_VERITY; /* * STEP_DECOMPRESS is handled specially, since a compressed file might * contain both compressed and uncompressed clusters. We'll allocate a * bio_post_read_ctx if the file is compressed, but the caller is * responsible for enabling STEP_DECOMPRESS if it's actually needed. */ if (post_read_steps || f2fs_compressed_file(inode)) { /* Due to the mempool, this never fails. */ ctx = mempool_alloc(bio_post_read_ctx_pool, GFP_NOFS); ctx->bio = bio; ctx->sbi = sbi; ctx->enabled_steps = post_read_steps; ctx->fs_blkaddr = blkaddr; ctx->decompression_attempted = false; bio->bi_private = ctx; } iostat_alloc_and_bind_ctx(sbi, bio, ctx); return bio; } /* This can handle encryption stuffs */ static int f2fs_submit_page_read(struct inode *inode, struct page *page, block_t blkaddr, blk_opf_t op_flags, bool for_write) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct bio *bio; bio = f2fs_grab_read_bio(inode, blkaddr, 1, op_flags, page->index, for_write); if (IS_ERR(bio)) return PTR_ERR(bio); /* wait for GCed page writeback via META_MAPPING */ f2fs_wait_on_block_writeback(inode, blkaddr); if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { iostat_update_and_unbind_ctx(bio); if (bio->bi_private) mempool_free(bio->bi_private, bio_post_read_ctx_pool); bio_put(bio); return -EFAULT; } inc_page_count(sbi, F2FS_RD_DATA); f2fs_update_iostat(sbi, NULL, FS_DATA_READ_IO, F2FS_BLKSIZE); f2fs_submit_read_bio(sbi, bio, DATA); return 0; } static void __set_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr) { __le32 *addr = get_dnode_addr(dn->inode, dn->node_page); dn->data_blkaddr = blkaddr; addr[dn->ofs_in_node] = cpu_to_le32(dn->data_blkaddr); } /* * Lock ordering for the change of data block address: * ->data_page * ->node_page * update block addresses in the node page */ void f2fs_set_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr) { f2fs_wait_on_page_writeback(dn->node_page, NODE, true, true); __set_data_blkaddr(dn, blkaddr); if (set_page_dirty(dn->node_page)) dn->node_changed = true; } void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr) { f2fs_set_data_blkaddr(dn, blkaddr); f2fs_update_read_extent_cache(dn); } /* dn->ofs_in_node will be returned with up-to-date last block pointer */ int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); int err; if (!count) return 0; if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) return -EPERM; if (unlikely((err = inc_valid_block_count(sbi, dn->inode, &count)))) return err; trace_f2fs_reserve_new_blocks(dn->inode, dn->nid, dn->ofs_in_node, count); f2fs_wait_on_page_writeback(dn->node_page, NODE, true, true); for (; count > 0; dn->ofs_in_node++) { block_t blkaddr = f2fs_data_blkaddr(dn); if (blkaddr == NULL_ADDR) { __set_data_blkaddr(dn, NEW_ADDR); count--; } } if (set_page_dirty(dn->node_page)) dn->node_changed = true; return 0; } /* Should keep dn->ofs_in_node unchanged */ int f2fs_reserve_new_block(struct dnode_of_data *dn) { unsigned int ofs_in_node = dn->ofs_in_node; int ret; ret = f2fs_reserve_new_blocks(dn, 1); dn->ofs_in_node = ofs_in_node; return ret; } int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index) { bool need_put = dn->inode_page ? false : true; int err; err = f2fs_get_dnode_of_data(dn, index, ALLOC_NODE); if (err) return err; if (dn->data_blkaddr == NULL_ADDR) err = f2fs_reserve_new_block(dn); if (err || need_put) f2fs_put_dnode(dn); return err; } struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, blk_opf_t op_flags, bool for_write, pgoff_t *next_pgofs) { struct address_space *mapping = inode->i_mapping; struct dnode_of_data dn; struct page *page; int err; page = f2fs_grab_cache_page(mapping, index, for_write); if (!page) return ERR_PTR(-ENOMEM); if (f2fs_lookup_read_extent_cache_block(inode, index, &dn.data_blkaddr)) { if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), dn.data_blkaddr, DATA_GENERIC_ENHANCE_READ)) { err = -EFSCORRUPTED; f2fs_handle_error(F2FS_I_SB(inode), ERROR_INVALID_BLKADDR); goto put_err; } goto got_it; } set_new_dnode(&dn, inode, NULL, NULL, 0); err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE); if (err) { if (err == -ENOENT && next_pgofs) *next_pgofs = f2fs_get_next_page_offset(&dn, index); goto put_err; } f2fs_put_dnode(&dn); if (unlikely(dn.data_blkaddr == NULL_ADDR)) { err = -ENOENT; if (next_pgofs) *next_pgofs = index + 1; goto put_err; } if (dn.data_blkaddr != NEW_ADDR && !f2fs_is_valid_blkaddr(F2FS_I_SB(inode), dn.data_blkaddr, DATA_GENERIC_ENHANCE)) { err = -EFSCORRUPTED; f2fs_handle_error(F2FS_I_SB(inode), ERROR_INVALID_BLKADDR); goto put_err; } got_it: if (PageUptodate(page)) { unlock_page(page); return page; } /* * A new dentry page is allocated but not able to be written, since its * new inode page couldn't be allocated due to -ENOSPC. * In such the case, its blkaddr can be remained as NEW_ADDR. * see, f2fs_add_link -> f2fs_get_new_data_page -> * f2fs_init_inode_metadata. */ if (dn.data_blkaddr == NEW_ADDR) { zero_user_segment(page, 0, PAGE_SIZE); if (!PageUptodate(page)) SetPageUptodate(page); unlock_page(page); return page; } err = f2fs_submit_page_read(inode, page, dn.data_blkaddr, op_flags, for_write); if (err) goto put_err; return page; put_err: f2fs_put_page(page, 1); return ERR_PTR(err); } struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index, pgoff_t *next_pgofs) { struct address_space *mapping = inode->i_mapping; struct page *page; page = find_get_page(mapping, index); if (page && PageUptodate(page)) return page; f2fs_put_page(page, 0); page = f2fs_get_read_data_page(inode, index, 0, false, next_pgofs); if (IS_ERR(page)) return page; if (PageUptodate(page)) return page; wait_on_page_locked(page); if (unlikely(!PageUptodate(page))) { f2fs_put_page(page, 0); return ERR_PTR(-EIO); } return page; } /* * If it tries to access a hole, return an error. * Because, the callers, functions in dir.c and GC, should be able to know * whether this page exists or not. */ struct page *f2fs_get_lock_data_page(struct inode *inode, pgoff_t index, bool for_write) { struct address_space *mapping = inode->i_mapping; struct page *page; page = f2fs_get_read_data_page(inode, index, 0, for_write, NULL); if (IS_ERR(page)) return page; /* wait for read completion */ lock_page(page); if (unlikely(page->mapping != mapping || !PageUptodate(page))) { f2fs_put_page(page, 1); return ERR_PTR(-EIO); } return page; } /* * Caller ensures that this data page is never allocated. * A new zero-filled data page is allocated in the page cache. * * Also, caller should grab and release a rwsem by calling f2fs_lock_op() and * f2fs_unlock_op(). * Note that, ipage is set only by make_empty_dir, and if any error occur, * ipage should be released by this function. */ struct page *f2fs_get_new_data_page(struct inode *inode, struct page *ipage, pgoff_t index, bool new_i_size) { struct address_space *mapping = inode->i_mapping; struct page *page; struct dnode_of_data dn; int err; page = f2fs_grab_cache_page(mapping, index, true); if (!page) { /* * before exiting, we should make sure ipage will be released * if any error occur. */ f2fs_put_page(ipage, 1); return ERR_PTR(-ENOMEM); } set_new_dnode(&dn, inode, ipage, NULL, 0); err = f2fs_reserve_block(&dn, index); if (err) { f2fs_put_page(page, 1); return ERR_PTR(err); } if (!ipage) f2fs_put_dnode(&dn); if (PageUptodate(page)) goto got_it; if (dn.data_blkaddr == NEW_ADDR) { zero_user_segment(page, 0, PAGE_SIZE); if (!PageUptodate(page)) SetPageUptodate(page); } else { f2fs_put_page(page, 1); /* if ipage exists, blkaddr should be NEW_ADDR */ f2fs_bug_on(F2FS_I_SB(inode), ipage); page = f2fs_get_lock_data_page(inode, index, true); if (IS_ERR(page)) return page; } got_it: if (new_i_size && i_size_read(inode) < ((loff_t)(index + 1) << PAGE_SHIFT)) f2fs_i_size_write(inode, ((loff_t)(index + 1) << PAGE_SHIFT)); return page; } static int __allocate_data_block(struct dnode_of_data *dn, int seg_type) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct f2fs_summary sum; struct node_info ni; block_t old_blkaddr; blkcnt_t count = 1; int err; if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) return -EPERM; err = f2fs_get_node_info(sbi, dn->nid, &ni, false); if (err) return err; dn->data_blkaddr = f2fs_data_blkaddr(dn); if (dn->data_blkaddr == NULL_ADDR) { err = inc_valid_block_count(sbi, dn->inode, &count); if (unlikely(err)) return err; } set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); old_blkaddr = dn->data_blkaddr; f2fs_allocate_data_block(sbi, NULL, old_blkaddr, &dn->data_blkaddr, &sum, seg_type, NULL); if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) f2fs_invalidate_internal_cache(sbi, old_blkaddr); f2fs_update_data_blkaddr(dn, dn->data_blkaddr); return 0; } static void f2fs_map_lock(struct f2fs_sb_info *sbi, int flag) { if (flag == F2FS_GET_BLOCK_PRE_AIO) f2fs_down_read(&sbi->node_change); else f2fs_lock_op(sbi); } static void f2fs_map_unlock(struct f2fs_sb_info *sbi, int flag) { if (flag == F2FS_GET_BLOCK_PRE_AIO) f2fs_up_read(&sbi->node_change); else f2fs_unlock_op(sbi); } int f2fs_get_block_locked(struct dnode_of_data *dn, pgoff_t index) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); int err = 0; f2fs_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO); if (!f2fs_lookup_read_extent_cache_block(dn->inode, index, &dn->data_blkaddr)) err = f2fs_reserve_block(dn, index); f2fs_map_unlock(sbi, F2FS_GET_BLOCK_PRE_AIO); return err; } static int f2fs_map_no_dnode(struct inode *inode, struct f2fs_map_blocks *map, struct dnode_of_data *dn, pgoff_t pgoff) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); /* * There is one exceptional case that read_node_page() may return * -ENOENT due to filesystem has been shutdown or cp_error, return * -EIO in that case. */ if (map->m_may_create && (is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN) || f2fs_cp_error(sbi))) return -EIO; if (map->m_next_pgofs) *map->m_next_pgofs = f2fs_get_next_page_offset(dn, pgoff); if (map->m_next_extent) *map->m_next_extent = f2fs_get_next_page_offset(dn, pgoff); return 0; } static bool f2fs_map_blocks_cached(struct inode *inode, struct f2fs_map_blocks *map, int flag) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); unsigned int maxblocks = map->m_len; pgoff_t pgoff = (pgoff_t)map->m_lblk; struct extent_info ei = {}; if (!f2fs_lookup_read_extent_cache(inode, pgoff, &ei)) return false; map->m_pblk = ei.blk + pgoff - ei.fofs; map->m_len = min((pgoff_t)maxblocks, ei.fofs + ei.len - pgoff); map->m_flags = F2FS_MAP_MAPPED; if (map->m_next_extent) *map->m_next_extent = pgoff + map->m_len; /* for hardware encryption, but to avoid potential issue in future */ if (flag == F2FS_GET_BLOCK_DIO) f2fs_wait_on_block_writeback_range(inode, map->m_pblk, map->m_len); if (f2fs_allow_multi_device_dio(sbi, flag)) { int bidx = f2fs_target_device_index(sbi, map->m_pblk); struct f2fs_dev_info *dev = &sbi->devs[bidx]; map->m_bdev = dev->bdev; map->m_pblk -= dev->start_blk; map->m_len = min(map->m_len, dev->end_blk + 1 - map->m_pblk); } else { map->m_bdev = inode->i_sb->s_bdev; } return true; } /* * f2fs_map_blocks() tries to find or build mapping relationship which * maps continuous logical blocks to physical blocks, and return such * info via f2fs_map_blocks structure. */ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int flag) { unsigned int maxblocks = map->m_len; struct dnode_of_data dn; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); int mode = map->m_may_create ? ALLOC_NODE : LOOKUP_NODE; pgoff_t pgofs, end_offset, end; int err = 0, ofs = 1; unsigned int ofs_in_node, last_ofs_in_node; blkcnt_t prealloc; block_t blkaddr; unsigned int start_pgofs; int bidx = 0; bool is_hole; if (!maxblocks) return 0; if (!map->m_may_create && f2fs_map_blocks_cached(inode, map, flag)) goto out; map->m_bdev = inode->i_sb->s_bdev; map->m_multidev_dio = f2fs_allow_multi_device_dio(F2FS_I_SB(inode), flag); map->m_len = 0; map->m_flags = 0; /* it only supports block size == page size */ pgofs = (pgoff_t)map->m_lblk; end = pgofs + maxblocks; next_dnode: if (map->m_may_create) f2fs_map_lock(sbi, flag); /* When reading holes, we need its node page */ set_new_dnode(&dn, inode, NULL, NULL, 0); err = f2fs_get_dnode_of_data(&dn, pgofs, mode); if (err) { if (flag == F2FS_GET_BLOCK_BMAP) map->m_pblk = 0; if (err == -ENOENT) err = f2fs_map_no_dnode(inode, map, &dn, pgofs); goto unlock_out; } start_pgofs = pgofs; prealloc = 0; last_ofs_in_node = ofs_in_node = dn.ofs_in_node; end_offset = ADDRS_PER_PAGE(dn.node_page, inode); next_block: blkaddr = f2fs_data_blkaddr(&dn); is_hole = !__is_valid_data_blkaddr(blkaddr); if (!is_hole && !f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE)) { err = -EFSCORRUPTED; f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); goto sync_out; } /* use out-place-update for direct IO under LFS mode */ if (map->m_may_create && (is_hole || (f2fs_lfs_mode(sbi) && flag == F2FS_GET_BLOCK_DIO))) { if (unlikely(f2fs_cp_error(sbi))) { err = -EIO; goto sync_out; } switch (flag) { case F2FS_GET_BLOCK_PRE_AIO: if (blkaddr == NULL_ADDR) { prealloc++; last_ofs_in_node = dn.ofs_in_node; } break; case F2FS_GET_BLOCK_PRE_DIO: case F2FS_GET_BLOCK_DIO: err = __allocate_data_block(&dn, map->m_seg_type); if (err) goto sync_out; if (flag == F2FS_GET_BLOCK_PRE_DIO) file_need_truncate(inode); set_inode_flag(inode, FI_APPEND_WRITE); break; default: WARN_ON_ONCE(1); err = -EIO; goto sync_out; } blkaddr = dn.data_blkaddr; if (is_hole) map->m_flags |= F2FS_MAP_NEW; } else if (is_hole) { if (f2fs_compressed_file(inode) && f2fs_sanity_check_cluster(&dn)) { err = -EFSCORRUPTED; f2fs_handle_error(sbi, ERROR_CORRUPTED_CLUSTER); goto sync_out; } switch (flag) { case F2FS_GET_BLOCK_PRECACHE: goto sync_out; case F2FS_GET_BLOCK_BMAP: map->m_pblk = 0; goto sync_out; case F2FS_GET_BLOCK_FIEMAP: if (blkaddr == NULL_ADDR) { if (map->m_next_pgofs) *map->m_next_pgofs = pgofs + 1; goto sync_out; } break; default: /* for defragment case */ if (map->m_next_pgofs) *map->m_next_pgofs = pgofs + 1; goto sync_out; } } if (flag == F2FS_GET_BLOCK_PRE_AIO) goto skip; if (map->m_multidev_dio) bidx = f2fs_target_device_index(sbi, blkaddr); if (map->m_len == 0) { /* reserved delalloc block should be mapped for fiemap. */ if (blkaddr == NEW_ADDR) map->m_flags |= F2FS_MAP_DELALLOC; map->m_flags |= F2FS_MAP_MAPPED; map->m_pblk = blkaddr; map->m_len = 1; if (map->m_multidev_dio) map->m_bdev = FDEV(bidx).bdev; } else if ((map->m_pblk != NEW_ADDR && blkaddr == (map->m_pblk + ofs)) || (map->m_pblk == NEW_ADDR && blkaddr == NEW_ADDR) || flag == F2FS_GET_BLOCK_PRE_DIO) { if (map->m_multidev_dio && map->m_bdev != FDEV(bidx).bdev) goto sync_out; ofs++; map->m_len++; } else { goto sync_out; } skip: dn.ofs_in_node++; pgofs++; /* preallocate blocks in batch for one dnode page */ if (flag == F2FS_GET_BLOCK_PRE_AIO && (pgofs == end || dn.ofs_in_node == end_offset)) { dn.ofs_in_node = ofs_in_node; err = f2fs_reserve_new_blocks(&dn, prealloc); if (err) goto sync_out; map->m_len += dn.ofs_in_node - ofs_in_node; if (prealloc && dn.ofs_in_node != last_ofs_in_node + 1) { err = -ENOSPC; goto sync_out; } dn.ofs_in_node = end_offset; } if (pgofs >= end) goto sync_out; else if (dn.ofs_in_node < end_offset) goto next_block; if (flag == F2FS_GET_BLOCK_PRECACHE) { if (map->m_flags & F2FS_MAP_MAPPED) { unsigned int ofs = start_pgofs - map->m_lblk; f2fs_update_read_extent_cache_range(&dn, start_pgofs, map->m_pblk + ofs, map->m_len - ofs); } } f2fs_put_dnode(&dn); if (map->m_may_create) { f2fs_map_unlock(sbi, flag); f2fs_balance_fs(sbi, dn.node_changed); } goto next_dnode; sync_out: if (flag == F2FS_GET_BLOCK_DIO && map->m_flags & F2FS_MAP_MAPPED) { /* * for hardware encryption, but to avoid potential issue * in future */ f2fs_wait_on_block_writeback_range(inode, map->m_pblk, map->m_len); if (map->m_multidev_dio) { block_t blk_addr = map->m_pblk; bidx = f2fs_target_device_index(sbi, map->m_pblk); map->m_bdev = FDEV(bidx).bdev; map->m_pblk -= FDEV(bidx).start_blk; if (map->m_may_create) f2fs_update_device_state(sbi, inode->i_ino, blk_addr, map->m_len); f2fs_bug_on(sbi, blk_addr + map->m_len > FDEV(bidx).end_blk + 1); } } if (flag == F2FS_GET_BLOCK_PRECACHE) { if (map->m_flags & F2FS_MAP_MAPPED) { unsigned int ofs = start_pgofs - map->m_lblk; f2fs_update_read_extent_cache_range(&dn, start_pgofs, map->m_pblk + ofs, map->m_len - ofs); } if (map->m_next_extent) *map->m_next_extent = pgofs + 1; } f2fs_put_dnode(&dn); unlock_out: if (map->m_may_create) { f2fs_map_unlock(sbi, flag); f2fs_balance_fs(sbi, dn.node_changed); } out: trace_f2fs_map_blocks(inode, map, flag, err); return err; } bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len) { struct f2fs_map_blocks map; block_t last_lblk; int err; if (pos + len > i_size_read(inode)) return false; map.m_lblk = F2FS_BYTES_TO_BLK(pos); map.m_next_pgofs = NULL; map.m_next_extent = NULL; map.m_seg_type = NO_CHECK_TYPE; map.m_may_create = false; last_lblk = F2FS_BLK_ALIGN(pos + len); while (map.m_lblk < last_lblk) { map.m_len = last_lblk - map.m_lblk; err = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_DEFAULT); if (err || map.m_len == 0) return false; map.m_lblk += map.m_len; } return true; } static inline u64 bytes_to_blks(struct inode *inode, u64 bytes) { return (bytes >> inode->i_blkbits); } static inline u64 blks_to_bytes(struct inode *inode, u64 blks) { return (blks << inode->i_blkbits); } static int f2fs_xattr_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct page *page; struct node_info ni; __u64 phys = 0, len; __u32 flags; nid_t xnid = F2FS_I(inode)->i_xattr_nid; int err = 0; if (f2fs_has_inline_xattr(inode)) { int offset; page = f2fs_grab_cache_page(NODE_MAPPING(sbi), inode->i_ino, false); if (!page) return -ENOMEM; err = f2fs_get_node_info(sbi, inode->i_ino, &ni, false); if (err) { f2fs_put_page(page, 1); return err; } phys = blks_to_bytes(inode, ni.blk_addr); offset = offsetof(struct f2fs_inode, i_addr) + sizeof(__le32) * (DEF_ADDRS_PER_INODE - get_inline_xattr_addrs(inode)); phys += offset; len = inline_xattr_size(inode); f2fs_put_page(page, 1); flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_NOT_ALIGNED; if (!xnid) flags |= FIEMAP_EXTENT_LAST; err = fiemap_fill_next_extent(fieinfo, 0, phys, len, flags); trace_f2fs_fiemap(inode, 0, phys, len, flags, err); if (err) return err; } if (xnid) { page = f2fs_grab_cache_page(NODE_MAPPING(sbi), xnid, false); if (!page) return -ENOMEM; err = f2fs_get_node_info(sbi, xnid, &ni, false); if (err) { f2fs_put_page(page, 1); return err; } phys = blks_to_bytes(inode, ni.blk_addr); len = inode->i_sb->s_blocksize; f2fs_put_page(page, 1); flags = FIEMAP_EXTENT_LAST; } if (phys) { err = fiemap_fill_next_extent(fieinfo, 0, phys, len, flags); trace_f2fs_fiemap(inode, 0, phys, len, flags, err); } return (err < 0 ? err : 0); } static loff_t max_inode_blocks(struct inode *inode) { loff_t result = ADDRS_PER_INODE(inode); loff_t leaf_count = ADDRS_PER_BLOCK(inode); /* two direct node blocks */ result += (leaf_count * 2); /* two indirect node blocks */ leaf_count *= NIDS_PER_BLOCK; result += (leaf_count * 2); /* one double indirect node block */ leaf_count *= NIDS_PER_BLOCK; result += leaf_count; return result; } int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { struct f2fs_map_blocks map; sector_t start_blk, last_blk; pgoff_t next_pgofs; u64 logical = 0, phys = 0, size = 0; u32 flags = 0; int ret = 0; bool compr_cluster = false, compr_appended; unsigned int cluster_size = F2FS_I(inode)->i_cluster_size; unsigned int count_in_cluster = 0; loff_t maxbytes; if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { ret = f2fs_precache_extents(inode); if (ret) return ret; } ret = fiemap_prep(inode, fieinfo, start, &len, FIEMAP_FLAG_XATTR); if (ret) return ret; inode_lock_shared(inode); maxbytes = max_file_blocks(inode) << F2FS_BLKSIZE_BITS; if (start > maxbytes) { ret = -EFBIG; goto out; } if (len > maxbytes || (maxbytes - len) < start) len = maxbytes - start; if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { ret = f2fs_xattr_fiemap(inode, fieinfo); goto out; } if (f2fs_has_inline_data(inode) || f2fs_has_inline_dentry(inode)) { ret = f2fs_inline_data_fiemap(inode, fieinfo, start, len); if (ret != -EAGAIN) goto out; } if (bytes_to_blks(inode, len) == 0) len = blks_to_bytes(inode, 1); start_blk = bytes_to_blks(inode, start); last_blk = bytes_to_blks(inode, start + len - 1); next: memset(&map, 0, sizeof(map)); map.m_lblk = start_blk; map.m_len = bytes_to_blks(inode, len); map.m_next_pgofs = &next_pgofs; map.m_seg_type = NO_CHECK_TYPE; if (compr_cluster) { map.m_lblk += 1; map.m_len = cluster_size - count_in_cluster; } ret = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_FIEMAP); if (ret) goto out; /* HOLE */ if (!compr_cluster && !(map.m_flags & F2FS_MAP_FLAGS)) { start_blk = next_pgofs; if (blks_to_bytes(inode, start_blk) < blks_to_bytes(inode, max_inode_blocks(inode))) goto prep_next; flags |= FIEMAP_EXTENT_LAST; } compr_appended = false; /* In a case of compressed cluster, append this to the last extent */ if (compr_cluster && ((map.m_flags & F2FS_MAP_DELALLOC) || !(map.m_flags & F2FS_MAP_FLAGS))) { compr_appended = true; goto skip_fill; } if (size) { flags |= FIEMAP_EXTENT_MERGED; if (IS_ENCRYPTED(inode)) flags |= FIEMAP_EXTENT_DATA_ENCRYPTED; ret = fiemap_fill_next_extent(fieinfo, logical, phys, size, flags); trace_f2fs_fiemap(inode, logical, phys, size, flags, ret); if (ret) goto out; size = 0; } if (start_blk > last_blk) goto out; skip_fill: if (map.m_pblk == COMPRESS_ADDR) { compr_cluster = true; count_in_cluster = 1; } else if (compr_appended) { unsigned int appended_blks = cluster_size - count_in_cluster + 1; size += blks_to_bytes(inode, appended_blks); start_blk += appended_blks; compr_cluster = false; } else { logical = blks_to_bytes(inode, start_blk); phys = __is_valid_data_blkaddr(map.m_pblk) ? blks_to_bytes(inode, map.m_pblk) : 0; size = blks_to_bytes(inode, map.m_len); flags = 0; if (compr_cluster) { flags = FIEMAP_EXTENT_ENCODED; count_in_cluster += map.m_len; if (count_in_cluster == cluster_size) { compr_cluster = false; size += blks_to_bytes(inode, 1); } } else if (map.m_flags & F2FS_MAP_DELALLOC) { flags = FIEMAP_EXTENT_UNWRITTEN; } start_blk += bytes_to_blks(inode, size); } prep_next: cond_resched(); if (fatal_signal_pending(current)) ret = -EINTR; else goto next; out: if (ret == 1) ret = 0; inode_unlock_shared(inode); return ret; } static inline loff_t f2fs_readpage_limit(struct inode *inode) { if (IS_ENABLED(CONFIG_FS_VERITY) && IS_VERITY(inode)) return inode->i_sb->s_maxbytes; return i_size_read(inode); } static int f2fs_read_single_page(struct inode *inode, struct page *page, unsigned nr_pages, struct f2fs_map_blocks *map, struct bio **bio_ret, sector_t *last_block_in_bio, bool is_readahead) { struct bio *bio = *bio_ret; const unsigned blocksize = blks_to_bytes(inode, 1); sector_t block_in_file; sector_t last_block; sector_t last_block_in_file; sector_t block_nr; int ret = 0; block_in_file = (sector_t)page_index(page); last_block = block_in_file + nr_pages; last_block_in_file = bytes_to_blks(inode, f2fs_readpage_limit(inode) + blocksize - 1); if (last_block > last_block_in_file) last_block = last_block_in_file; /* just zeroing out page which is beyond EOF */ if (block_in_file >= last_block) goto zero_out; /* * Map blocks using the previous result first. */ if ((map->m_flags & F2FS_MAP_MAPPED) && block_in_file > map->m_lblk && block_in_file < (map->m_lblk + map->m_len)) goto got_it; /* * Then do more f2fs_map_blocks() calls until we are * done with this page. */ map->m_lblk = block_in_file; map->m_len = last_block - block_in_file; ret = f2fs_map_blocks(inode, map, F2FS_GET_BLOCK_DEFAULT); if (ret) goto out; got_it: if ((map->m_flags & F2FS_MAP_MAPPED)) { block_nr = map->m_pblk + block_in_file - map->m_lblk; SetPageMappedToDisk(page); if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), block_nr, DATA_GENERIC_ENHANCE_READ)) { ret = -EFSCORRUPTED; f2fs_handle_error(F2FS_I_SB(inode), ERROR_INVALID_BLKADDR); goto out; } } else { zero_out: zero_user_segment(page, 0, PAGE_SIZE); if (f2fs_need_verity(inode, page->index) && !fsverity_verify_page(page)) { ret = -EIO; goto out; } if (!PageUptodate(page)) SetPageUptodate(page); unlock_page(page); goto out; } /* * This page will go to BIO. Do we need to send this * BIO off first? */ if (bio && (!page_is_mergeable(F2FS_I_SB(inode), bio, *last_block_in_bio, block_nr) || !f2fs_crypt_mergeable_bio(bio, inode, page->index, NULL))) { submit_and_realloc: f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA); bio = NULL; } if (bio == NULL) { bio = f2fs_grab_read_bio(inode, block_nr, nr_pages, is_readahead ? REQ_RAHEAD : 0, page->index, false); if (IS_ERR(bio)) { ret = PTR_ERR(bio); bio = NULL; goto out; } } /* * If the page is under writeback, we need to wait for * its completion to see the correct decrypted data. */ f2fs_wait_on_block_writeback(inode, block_nr); if (bio_add_page(bio, page, blocksize, 0) < blocksize) goto submit_and_realloc; inc_page_count(F2FS_I_SB(inode), F2FS_RD_DATA); f2fs_update_iostat(F2FS_I_SB(inode), NULL, FS_DATA_READ_IO, F2FS_BLKSIZE); *last_block_in_bio = block_nr; out: *bio_ret = bio; return ret; } #ifdef CONFIG_F2FS_FS_COMPRESSION int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, unsigned nr_pages, sector_t *last_block_in_bio, bool is_readahead, bool for_write) { struct dnode_of_data dn; struct inode *inode = cc->inode; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct bio *bio = *bio_ret; unsigned int start_idx = cc->cluster_idx << cc->log_cluster_size; sector_t last_block_in_file; const unsigned blocksize = blks_to_bytes(inode, 1); struct decompress_io_ctx *dic = NULL; struct extent_info ei = {}; bool from_dnode = true; int i; int ret = 0; f2fs_bug_on(sbi, f2fs_cluster_is_empty(cc)); last_block_in_file = bytes_to_blks(inode, f2fs_readpage_limit(inode) + blocksize - 1); /* get rid of pages beyond EOF */ for (i = 0; i < cc->cluster_size; i++) { struct page *page = cc->rpages[i]; if (!page) continue; if ((sector_t)page->index >= last_block_in_file) { zero_user_segment(page, 0, PAGE_SIZE); if (!PageUptodate(page)) SetPageUptodate(page); } else if (!PageUptodate(page)) { continue; } unlock_page(page); if (for_write) put_page(page); cc->rpages[i] = NULL; cc->nr_rpages--; } /* we are done since all pages are beyond EOF */ if (f2fs_cluster_is_empty(cc)) goto out; if (f2fs_lookup_read_extent_cache(inode, start_idx, &ei)) from_dnode = false; if (!from_dnode) goto skip_reading_dnode; set_new_dnode(&dn, inode, NULL, NULL, 0); ret = f2fs_get_dnode_of_data(&dn, start_idx, LOOKUP_NODE); if (ret) goto out; if (unlikely(f2fs_cp_error(sbi))) { ret = -EIO; goto out_put_dnode; } f2fs_bug_on(sbi, dn.data_blkaddr != COMPRESS_ADDR); skip_reading_dnode: for (i = 1; i < cc->cluster_size; i++) { block_t blkaddr; blkaddr = from_dnode ? data_blkaddr(dn.inode, dn.node_page, dn.ofs_in_node + i) : ei.blk + i - 1; if (!__is_valid_data_blkaddr(blkaddr)) break; if (!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC)) { ret = -EFAULT; goto out_put_dnode; } cc->nr_cpages++; if (!from_dnode && i >= ei.c_len) break; } /* nothing to decompress */ if (cc->nr_cpages == 0) { ret = 0; goto out_put_dnode; } dic = f2fs_alloc_dic(cc); if (IS_ERR(dic)) { ret = PTR_ERR(dic); goto out_put_dnode; } for (i = 0; i < cc->nr_cpages; i++) { struct page *page = dic->cpages[i]; block_t blkaddr; struct bio_post_read_ctx *ctx; blkaddr = from_dnode ? data_blkaddr(dn.inode, dn.node_page, dn.ofs_in_node + i + 1) : ei.blk + i; f2fs_wait_on_block_writeback(inode, blkaddr); if (f2fs_load_compressed_page(sbi, page, blkaddr)) { if (atomic_dec_and_test(&dic->remaining_pages)) { f2fs_decompress_cluster(dic, true); break; } continue; } if (bio && (!page_is_mergeable(sbi, bio, *last_block_in_bio, blkaddr) || !f2fs_crypt_mergeable_bio(bio, inode, page->index, NULL))) { submit_and_realloc: f2fs_submit_read_bio(sbi, bio, DATA); bio = NULL; } if (!bio) { bio = f2fs_grab_read_bio(inode, blkaddr, nr_pages, is_readahead ? REQ_RAHEAD : 0, page->index, for_write); if (IS_ERR(bio)) { ret = PTR_ERR(bio); f2fs_decompress_end_io(dic, ret, true); f2fs_put_dnode(&dn); *bio_ret = NULL; return ret; } } if (bio_add_page(bio, page, blocksize, 0) < blocksize) goto submit_and_realloc; ctx = get_post_read_ctx(bio); ctx->enabled_steps |= STEP_DECOMPRESS; refcount_inc(&dic->refcnt); inc_page_count(sbi, F2FS_RD_DATA); f2fs_update_iostat(sbi, inode, FS_DATA_READ_IO, F2FS_BLKSIZE); *last_block_in_bio = blkaddr; } if (from_dnode) f2fs_put_dnode(&dn); *bio_ret = bio; return 0; out_put_dnode: if (from_dnode) f2fs_put_dnode(&dn); out: for (i = 0; i < cc->cluster_size; i++) { if (cc->rpages[i]) { ClearPageUptodate(cc->rpages[i]); unlock_page(cc->rpages[i]); } } *bio_ret = bio; return ret; } #endif /* * This function was originally taken from fs/mpage.c, and customized for f2fs. * Major change was from block_size == page_size in f2fs by default. */ static int f2fs_mpage_readpages(struct inode *inode, struct readahead_control *rac, struct page *page) { struct bio *bio = NULL; sector_t last_block_in_bio = 0; struct f2fs_map_blocks map; #ifdef CONFIG_F2FS_FS_COMPRESSION struct compress_ctx cc = { .inode = inode, .log_cluster_size = F2FS_I(inode)->i_log_cluster_size, .cluster_size = F2FS_I(inode)->i_cluster_size, .cluster_idx = NULL_CLUSTER, .rpages = NULL, .cpages = NULL, .nr_rpages = 0, .nr_cpages = 0, }; pgoff_t nc_cluster_idx = NULL_CLUSTER; #endif unsigned nr_pages = rac ? readahead_count(rac) : 1; unsigned max_nr_pages = nr_pages; int ret = 0; map.m_pblk = 0; map.m_lblk = 0; map.m_len = 0; map.m_flags = 0; map.m_next_pgofs = NULL; map.m_next_extent = NULL; map.m_seg_type = NO_CHECK_TYPE; map.m_may_create = false; for (; nr_pages; nr_pages--) { if (rac) { page = readahead_page(rac); prefetchw(&page->flags); } #ifdef CONFIG_F2FS_FS_COMPRESSION if (f2fs_compressed_file(inode)) { /* there are remained compressed pages, submit them */ if (!f2fs_cluster_can_merge_page(&cc, page->index)) { ret = f2fs_read_multi_pages(&cc, &bio, max_nr_pages, &last_block_in_bio, rac != NULL, false); f2fs_destroy_compress_ctx(&cc, false); if (ret) goto set_error_page; } if (cc.cluster_idx == NULL_CLUSTER) { if (nc_cluster_idx == page->index >> cc.log_cluster_size) { goto read_single_page; } ret = f2fs_is_compressed_cluster(inode, page->index); if (ret < 0) goto set_error_page; else if (!ret) { nc_cluster_idx = page->index >> cc.log_cluster_size; goto read_single_page; } nc_cluster_idx = NULL_CLUSTER; } ret = f2fs_init_compress_ctx(&cc); if (ret) goto set_error_page; f2fs_compress_ctx_add_page(&cc, page); goto next_page; } read_single_page: #endif ret = f2fs_read_single_page(inode, page, max_nr_pages, &map, &bio, &last_block_in_bio, rac); if (ret) { #ifdef CONFIG_F2FS_FS_COMPRESSION set_error_page: #endif zero_user_segment(page, 0, PAGE_SIZE); unlock_page(page); } #ifdef CONFIG_F2FS_FS_COMPRESSION next_page: #endif if (rac) put_page(page); #ifdef CONFIG_F2FS_FS_COMPRESSION if (f2fs_compressed_file(inode)) { /* last page */ if (nr_pages == 1 && !f2fs_cluster_is_empty(&cc)) { ret = f2fs_read_multi_pages(&cc, &bio, max_nr_pages, &last_block_in_bio, rac != NULL, false); f2fs_destroy_compress_ctx(&cc, false); } } #endif } if (bio) f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA); return ret; } static int f2fs_read_data_folio(struct file *file, struct folio *folio) { struct page *page = &folio->page; struct inode *inode = page_file_mapping(page)->host; int ret = -EAGAIN; trace_f2fs_readpage(page, DATA); if (!f2fs_is_compress_backend_ready(inode)) { unlock_page(page); return -EOPNOTSUPP; } /* If the file has inline data, try to read it directly */ if (f2fs_has_inline_data(inode)) ret = f2fs_read_inline_data(inode, page); if (ret == -EAGAIN) ret = f2fs_mpage_readpages(inode, NULL, page); return ret; } static void f2fs_readahead(struct readahead_control *rac) { struct inode *inode = rac->mapping->host; trace_f2fs_readpages(inode, readahead_index(rac), readahead_count(rac)); if (!f2fs_is_compress_backend_ready(inode)) return; /* If the file has inline data, skip readahead */ if (f2fs_has_inline_data(inode)) return; f2fs_mpage_readpages(inode, rac, NULL); } int f2fs_encrypt_one_page(struct f2fs_io_info *fio) { struct inode *inode = fio->page->mapping->host; struct page *mpage, *page; gfp_t gfp_flags = GFP_NOFS; if (!f2fs_encrypted_file(inode)) return 0; page = fio->compressed_page ? fio->compressed_page : fio->page; if (fscrypt_inode_uses_inline_crypto(inode)) return 0; retry_encrypt: fio->encrypted_page = fscrypt_encrypt_pagecache_blocks(page, PAGE_SIZE, 0, gfp_flags); if (IS_ERR(fio->encrypted_page)) { /* flush pending IOs and wait for a while in the ENOMEM case */ if (PTR_ERR(fio->encrypted_page) == -ENOMEM) { f2fs_flush_merged_writes(fio->sbi); memalloc_retry_wait(GFP_NOFS); gfp_flags |= __GFP_NOFAIL; goto retry_encrypt; } return PTR_ERR(fio->encrypted_page); } mpage = find_lock_page(META_MAPPING(fio->sbi), fio->old_blkaddr); if (mpage) { if (PageUptodate(mpage)) memcpy(page_address(mpage), page_address(fio->encrypted_page), PAGE_SIZE); f2fs_put_page(mpage, 1); } return 0; } static inline bool check_inplace_update_policy(struct inode *inode, struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); if (IS_F2FS_IPU_HONOR_OPU_WRITE(sbi) && is_inode_flag_set(inode, FI_OPU_WRITE)) return false; if (IS_F2FS_IPU_FORCE(sbi)) return true; if (IS_F2FS_IPU_SSR(sbi) && f2fs_need_SSR(sbi)) return true; if (IS_F2FS_IPU_UTIL(sbi) && utilization(sbi) > SM_I(sbi)->min_ipu_util) return true; if (IS_F2FS_IPU_SSR_UTIL(sbi) && f2fs_need_SSR(sbi) && utilization(sbi) > SM_I(sbi)->min_ipu_util) return true; /* * IPU for rewrite async pages */ if (IS_F2FS_IPU_ASYNC(sbi) && fio && fio->op == REQ_OP_WRITE && !(fio->op_flags & REQ_SYNC) && !IS_ENCRYPTED(inode)) return true; /* this is only set during fdatasync */ if (IS_F2FS_IPU_FSYNC(sbi) && is_inode_flag_set(inode, FI_NEED_IPU)) return true; if (unlikely(fio && is_sbi_flag_set(sbi, SBI_CP_DISABLED) && !f2fs_is_checkpointed_data(sbi, fio->old_blkaddr))) return true; return false; } bool f2fs_should_update_inplace(struct inode *inode, struct f2fs_io_info *fio) { /* swap file is migrating in aligned write mode */ if (is_inode_flag_set(inode, FI_ALIGNED_WRITE)) return false; if (f2fs_is_pinned_file(inode)) return true; /* if this is cold file, we should overwrite to avoid fragmentation */ if (file_is_cold(inode) && !is_inode_flag_set(inode, FI_OPU_WRITE)) return true; return check_inplace_update_policy(inode, fio); } bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); /* The below cases were checked when setting it. */ if (f2fs_is_pinned_file(inode)) return false; if (fio && is_sbi_flag_set(sbi, SBI_NEED_FSCK)) return true; if (f2fs_lfs_mode(sbi)) return true; if (S_ISDIR(inode->i_mode)) return true; if (IS_NOQUOTA(inode)) return true; if (f2fs_is_atomic_file(inode)) return true; /* rewrite low ratio compress data w/ OPU mode to avoid fragmentation */ if (f2fs_compressed_file(inode) && F2FS_OPTION(sbi).compress_mode == COMPR_MODE_USER && is_inode_flag_set(inode, FI_ENABLE_COMPRESS)) return true; /* swap file is migrating in aligned write mode */ if (is_inode_flag_set(inode, FI_ALIGNED_WRITE)) return true; if (is_inode_flag_set(inode, FI_OPU_WRITE)) return true; if (fio) { if (page_private_gcing(fio->page)) return true; if (page_private_dummy(fio->page)) return true; if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED) && f2fs_is_checkpointed_data(sbi, fio->old_blkaddr))) return true; } return false; } static inline bool need_inplace_update(struct f2fs_io_info *fio) { struct inode *inode = fio->page->mapping->host; if (f2fs_should_update_outplace(inode, fio)) return false; return f2fs_should_update_inplace(inode, fio); } int f2fs_do_write_data_page(struct f2fs_io_info *fio) { struct page *page = fio->page; struct inode *inode = page->mapping->host; struct dnode_of_data dn; struct node_info ni; bool ipu_force = false; int err = 0; /* Use COW inode to make dnode_of_data for atomic write */ if (f2fs_is_atomic_file(inode)) set_new_dnode(&dn, F2FS_I(inode)->cow_inode, NULL, NULL, 0); else set_new_dnode(&dn, inode, NULL, NULL, 0); if (need_inplace_update(fio) && f2fs_lookup_read_extent_cache_block(inode, page->index, &fio->old_blkaddr)) { if (!f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr, DATA_GENERIC_ENHANCE)) { f2fs_handle_error(fio->sbi, ERROR_INVALID_BLKADDR); return -EFSCORRUPTED; } ipu_force = true; fio->need_lock = LOCK_DONE; goto got_it; } /* Deadlock due to between page->lock and f2fs_lock_op */ if (fio->need_lock == LOCK_REQ && !f2fs_trylock_op(fio->sbi)) return -EAGAIN; err = f2fs_get_dnode_of_data(&dn, page->index, LOOKUP_NODE); if (err) goto out; fio->old_blkaddr = dn.data_blkaddr; /* This page is already truncated */ if (fio->old_blkaddr == NULL_ADDR) { ClearPageUptodate(page); clear_page_private_gcing(page); goto out_writepage; } got_it: if (__is_valid_data_blkaddr(fio->old_blkaddr) && !f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr, DATA_GENERIC_ENHANCE)) { err = -EFSCORRUPTED; f2fs_handle_error(fio->sbi, ERROR_INVALID_BLKADDR); goto out_writepage; } /* wait for GCed page writeback via META_MAPPING */ if (fio->post_read) f2fs_wait_on_block_writeback(inode, fio->old_blkaddr); /* * If current allocation needs SSR, * it had better in-place writes for updated data. */ if (ipu_force || (__is_valid_data_blkaddr(fio->old_blkaddr) && need_inplace_update(fio))) { err = f2fs_encrypt_one_page(fio); if (err) goto out_writepage; set_page_writeback(page); f2fs_put_dnode(&dn); if (fio->need_lock == LOCK_REQ) f2fs_unlock_op(fio->sbi); err = f2fs_inplace_write_data(fio); if (err) { if (fscrypt_inode_uses_fs_layer_crypto(inode)) fscrypt_finalize_bounce_page(&fio->encrypted_page); if (PageWriteback(page)) end_page_writeback(page); } else { set_inode_flag(inode, FI_UPDATE_WRITE); } trace_f2fs_do_write_data_page(fio->page, IPU); return err; } if (fio->need_lock == LOCK_RETRY) { if (!f2fs_trylock_op(fio->sbi)) { err = -EAGAIN; goto out_writepage; } fio->need_lock = LOCK_REQ; } err = f2fs_get_node_info(fio->sbi, dn.nid, &ni, false); if (err) goto out_writepage; fio->version = ni.version; err = f2fs_encrypt_one_page(fio); if (err) goto out_writepage; set_page_writeback(page); if (fio->compr_blocks && fio->old_blkaddr == COMPRESS_ADDR) f2fs_i_compr_blocks_update(inode, fio->compr_blocks - 1, false); /* LFS mode write path */ f2fs_outplace_write_data(&dn, fio); trace_f2fs_do_write_data_page(page, OPU); set_inode_flag(inode, FI_APPEND_WRITE); out_writepage: f2fs_put_dnode(&dn); out: if (fio->need_lock == LOCK_REQ) f2fs_unlock_op(fio->sbi); return err; } int f2fs_write_single_data_page(struct page *page, int *submitted, struct bio **bio, sector_t *last_block, struct writeback_control *wbc, enum iostat_type io_type, int compr_blocks, bool allow_balance) { struct inode *inode = page->mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); loff_t i_size = i_size_read(inode); const pgoff_t end_index = ((unsigned long long)i_size) >> PAGE_SHIFT; loff_t psize = (loff_t)(page->index + 1) << PAGE_SHIFT; unsigned offset = 0; bool need_balance_fs = false; bool quota_inode = IS_NOQUOTA(inode); int err = 0; struct f2fs_io_info fio = { .sbi = sbi, .ino = inode->i_ino, .type = DATA, .op = REQ_OP_WRITE, .op_flags = wbc_to_write_flags(wbc), .old_blkaddr = NULL_ADDR, .page = page, .encrypted_page = NULL, .submitted = 0, .compr_blocks = compr_blocks, .need_lock = LOCK_RETRY, .post_read = f2fs_post_read_required(inode) ? 1 : 0, .io_type = io_type, .io_wbc = wbc, .bio = bio, .last_block = last_block, }; trace_f2fs_writepage(page, DATA); /* we should bypass data pages to proceed the kworker jobs */ if (unlikely(f2fs_cp_error(sbi))) { mapping_set_error(page->mapping, -EIO); /* * don't drop any dirty dentry pages for keeping lastest * directory structure. */ if (S_ISDIR(inode->i_mode) && !is_sbi_flag_set(sbi, SBI_IS_CLOSE)) goto redirty_out; /* keep data pages in remount-ro mode */ if (F2FS_OPTION(sbi).errors == MOUNT_ERRORS_READONLY) goto redirty_out; goto out; } if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) goto redirty_out; if (page->index < end_index || f2fs_verity_in_progress(inode) || compr_blocks) goto write; /* * If the offset is out-of-range of file size, * this page does not have to be written to disk. */ offset = i_size & (PAGE_SIZE - 1); if ((page->index >= end_index + 1) || !offset) goto out; zero_user_segment(page, offset, PAGE_SIZE); write: /* Dentry/quota blocks are controlled by checkpoint */ if (S_ISDIR(inode->i_mode) || quota_inode) { /* * We need to wait for node_write to avoid block allocation during * checkpoint. This can only happen to quota writes which can cause * the below discard race condition. */ if (quota_inode) f2fs_down_read(&sbi->node_write); fio.need_lock = LOCK_DONE; err = f2fs_do_write_data_page(&fio); if (quota_inode) f2fs_up_read(&sbi->node_write); goto done; } if (!wbc->for_reclaim) need_balance_fs = true; else if (has_not_enough_free_secs(sbi, 0, 0)) goto redirty_out; else set_inode_flag(inode, FI_HOT_DATA); err = -EAGAIN; if (f2fs_has_inline_data(inode)) { err = f2fs_write_inline_data(inode, page); if (!err) goto out; } if (err == -EAGAIN) { err = f2fs_do_write_data_page(&fio); if (err == -EAGAIN) { fio.need_lock = LOCK_REQ; err = f2fs_do_write_data_page(&fio); } } if (err) { file_set_keep_isize(inode); } else { spin_lock(&F2FS_I(inode)->i_size_lock); if (F2FS_I(inode)->last_disk_size < psize) F2FS_I(inode)->last_disk_size = psize; spin_unlock(&F2FS_I(inode)->i_size_lock); } done: if (err && err != -ENOENT) goto redirty_out; out: inode_dec_dirty_pages(inode); if (err) { ClearPageUptodate(page); clear_page_private_gcing(page); } if (wbc->for_reclaim) { f2fs_submit_merged_write_cond(sbi, NULL, page, 0, DATA); clear_inode_flag(inode, FI_HOT_DATA); f2fs_remove_dirty_inode(inode); submitted = NULL; } unlock_page(page); if (!S_ISDIR(inode->i_mode) && !IS_NOQUOTA(inode) && !F2FS_I(inode)->wb_task && allow_balance) f2fs_balance_fs(sbi, need_balance_fs); if (unlikely(f2fs_cp_error(sbi))) { f2fs_submit_merged_write(sbi, DATA); if (bio && *bio) f2fs_submit_merged_ipu_write(sbi, bio, NULL); submitted = NULL; } if (submitted) *submitted = fio.submitted; return 0; redirty_out: redirty_page_for_writepage(wbc, page); /* * pageout() in MM translates EAGAIN, so calls handle_write_error() * -> mapping_set_error() -> set_bit(AS_EIO, ...). * file_write_and_wait_range() will see EIO error, which is critical * to return value of fsync() followed by atomic_write failure to user. */ if (!err || wbc->for_reclaim) return AOP_WRITEPAGE_ACTIVATE; unlock_page(page); return err; } static int f2fs_write_data_page(struct page *page, struct writeback_control *wbc) { #ifdef CONFIG_F2FS_FS_COMPRESSION struct inode *inode = page->mapping->host; if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) goto out; if (f2fs_compressed_file(inode)) { if (f2fs_is_compressed_cluster(inode, page->index)) { redirty_page_for_writepage(wbc, page); return AOP_WRITEPAGE_ACTIVATE; } } out: #endif return f2fs_write_single_data_page(page, NULL, NULL, NULL, wbc, FS_DATA_IO, 0, true); } /* * This function was copied from write_cache_pages from mm/page-writeback.c. * The major change is making write step of cold data page separately from * warm/hot data page. */ static int f2fs_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc, enum iostat_type io_type) { int ret = 0; int done = 0, retry = 0; struct page *pages_local[F2FS_ONSTACK_PAGES]; struct page **pages = pages_local; struct folio_batch fbatch; struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); struct bio *bio = NULL; sector_t last_block; #ifdef CONFIG_F2FS_FS_COMPRESSION struct inode *inode = mapping->host; struct compress_ctx cc = { .inode = inode, .log_cluster_size = F2FS_I(inode)->i_log_cluster_size, .cluster_size = F2FS_I(inode)->i_cluster_size, .cluster_idx = NULL_CLUSTER, .rpages = NULL, .nr_rpages = 0, .cpages = NULL, .valid_nr_cpages = 0, .rbuf = NULL, .cbuf = NULL, .rlen = PAGE_SIZE * F2FS_I(inode)->i_cluster_size, .private = NULL, }; #endif int nr_folios, p, idx; int nr_pages; unsigned int max_pages = F2FS_ONSTACK_PAGES; pgoff_t index; pgoff_t end; /* Inclusive */ pgoff_t done_index; int range_whole = 0; xa_mark_t tag; int nwritten = 0; int submitted = 0; int i; #ifdef CONFIG_F2FS_FS_COMPRESSION if (f2fs_compressed_file(inode) && 1 << cc.log_cluster_size > F2FS_ONSTACK_PAGES) { pages = f2fs_kzalloc(sbi, sizeof(struct page *) << cc.log_cluster_size, GFP_NOFS | __GFP_NOFAIL); max_pages = 1 << cc.log_cluster_size; } #endif folio_batch_init(&fbatch); if (get_dirty_pages(mapping->host) <= SM_I(F2FS_M_SB(mapping))->min_hot_blocks) set_inode_flag(mapping->host, FI_HOT_DATA); else clear_inode_flag(mapping->host, FI_HOT_DATA); if (wbc->range_cyclic) { index = mapping->writeback_index; /* prev offset */ end = -1; } else { index = wbc->range_start >> PAGE_SHIFT; end = wbc->range_end >> PAGE_SHIFT; if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; } if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag = PAGECACHE_TAG_TOWRITE; else tag = PAGECACHE_TAG_DIRTY; retry: retry = 0; if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag_pages_for_writeback(mapping, index, end); done_index = index; while (!done && !retry && (index <= end)) { nr_pages = 0; again: nr_folios = filemap_get_folios_tag(mapping, &index, end, tag, &fbatch); if (nr_folios == 0) { if (nr_pages) goto write; break; } for (i = 0; i < nr_folios; i++) { struct folio *folio = fbatch.folios[i]; idx = 0; p = folio_nr_pages(folio); add_more: pages[nr_pages] = folio_page(folio, idx); folio_get(folio); if (++nr_pages == max_pages) { index = folio->index + idx + 1; folio_batch_release(&fbatch); goto write; } if (++idx < p) goto add_more; } folio_batch_release(&fbatch); goto again; write: for (i = 0; i < nr_pages; i++) { struct page *page = pages[i]; struct folio *folio = page_folio(page); bool need_readd; readd: need_readd = false; #ifdef CONFIG_F2FS_FS_COMPRESSION if (f2fs_compressed_file(inode)) { void *fsdata = NULL; struct page *pagep; int ret2; ret = f2fs_init_compress_ctx(&cc); if (ret) { done = 1; break; } if (!f2fs_cluster_can_merge_page(&cc, folio->index)) { ret = f2fs_write_multi_pages(&cc, &submitted, wbc, io_type); if (!ret) need_readd = true; goto result; } if (unlikely(f2fs_cp_error(sbi))) goto lock_folio; if (!f2fs_cluster_is_empty(&cc)) goto lock_folio; if (f2fs_all_cluster_page_ready(&cc, pages, i, nr_pages, true)) goto lock_folio; ret2 = f2fs_prepare_compress_overwrite( inode, &pagep, folio->index, &fsdata); if (ret2 < 0) { ret = ret2; done = 1; break; } else if (ret2 && (!f2fs_compress_write_end(inode, fsdata, folio->index, 1) || !f2fs_all_cluster_page_ready(&cc, pages, i, nr_pages, false))) { retry = 1; break; } } #endif /* give a priority to WB_SYNC threads */ if (atomic_read(&sbi->wb_sync_req[DATA]) && wbc->sync_mode == WB_SYNC_NONE) { done = 1; break; } #ifdef CONFIG_F2FS_FS_COMPRESSION lock_folio: #endif done_index = folio->index; retry_write: folio_lock(folio); if (unlikely(folio->mapping != mapping)) { continue_unlock: folio_unlock(folio); continue; } if (!folio_test_dirty(folio)) { /* someone wrote it for us */ goto continue_unlock; } if (folio_test_writeback(folio)) { if (wbc->sync_mode == WB_SYNC_NONE) goto continue_unlock; f2fs_wait_on_page_writeback(&folio->page, DATA, true, true); } if (!folio_clear_dirty_for_io(folio)) goto continue_unlock; #ifdef CONFIG_F2FS_FS_COMPRESSION if (f2fs_compressed_file(inode)) { folio_get(folio); f2fs_compress_ctx_add_page(&cc, &folio->page); continue; } #endif ret = f2fs_write_single_data_page(&folio->page, &submitted, &bio, &last_block, wbc, io_type, 0, true); if (ret == AOP_WRITEPAGE_ACTIVATE) folio_unlock(folio); #ifdef CONFIG_F2FS_FS_COMPRESSION result: #endif nwritten += submitted; wbc->nr_to_write -= submitted; if (unlikely(ret)) { /* * keep nr_to_write, since vfs uses this to * get # of written pages. */ if (ret == AOP_WRITEPAGE_ACTIVATE) { ret = 0; goto next; } else if (ret == -EAGAIN) { ret = 0; if (wbc->sync_mode == WB_SYNC_ALL) { f2fs_io_schedule_timeout( DEFAULT_IO_TIMEOUT); goto retry_write; } goto next; } done_index = folio_next_index(folio); done = 1; break; } if (wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE) { done = 1; break; } next: if (need_readd) goto readd; } release_pages(pages, nr_pages); cond_resched(); } #ifdef CONFIG_F2FS_FS_COMPRESSION /* flush remained pages in compress cluster */ if (f2fs_compressed_file(inode) && !f2fs_cluster_is_empty(&cc)) { ret = f2fs_write_multi_pages(&cc, &submitted, wbc, io_type); nwritten += submitted; wbc->nr_to_write -= submitted; if (ret) { done = 1; retry = 0; } } if (f2fs_compressed_file(inode)) f2fs_destroy_compress_ctx(&cc, false); #endif if (retry) { index = 0; end = -1; goto retry; } if (wbc->range_cyclic && !done) done_index = 0; if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) mapping->writeback_index = done_index; if (nwritten) f2fs_submit_merged_write_cond(F2FS_M_SB(mapping), mapping->host, NULL, 0, DATA); /* submit cached bio of IPU write */ if (bio) f2fs_submit_merged_ipu_write(sbi, &bio, NULL); #ifdef CONFIG_F2FS_FS_COMPRESSION if (pages != pages_local) kfree(pages); #endif return ret; } static inline bool __should_serialize_io(struct inode *inode, struct writeback_control *wbc) { /* to avoid deadlock in path of data flush */ if (F2FS_I(inode)->wb_task) return false; if (!S_ISREG(inode->i_mode)) return false; if (IS_NOQUOTA(inode)) return false; if (f2fs_need_compress_data(inode)) return true; if (wbc->sync_mode != WB_SYNC_ALL) return true; if (get_dirty_pages(inode) >= SM_I(F2FS_I_SB(inode))->min_seq_blocks) return true; return false; } static int __f2fs_write_data_pages(struct address_space *mapping, struct writeback_control *wbc, enum iostat_type io_type) { struct inode *inode = mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct blk_plug plug; int ret; bool locked = false; /* deal with chardevs and other special file */ if (!mapping->a_ops->writepage) return 0; /* skip writing if there is no dirty page in this inode */ if (!get_dirty_pages(inode) && wbc->sync_mode == WB_SYNC_NONE) return 0; /* during POR, we don't need to trigger writepage at all. */ if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) goto skip_write; if ((S_ISDIR(inode->i_mode) || IS_NOQUOTA(inode)) && wbc->sync_mode == WB_SYNC_NONE && get_dirty_pages(inode) < nr_pages_to_skip(sbi, DATA) && f2fs_available_free_memory(sbi, DIRTY_DENTS)) goto skip_write; /* skip writing in file defragment preparing stage */ if (is_inode_flag_set(inode, FI_SKIP_WRITES)) goto skip_write; trace_f2fs_writepages(mapping->host, wbc, DATA); /* to avoid spliting IOs due to mixed WB_SYNC_ALL and WB_SYNC_NONE */ if (wbc->sync_mode == WB_SYNC_ALL) atomic_inc(&sbi->wb_sync_req[DATA]); else if (atomic_read(&sbi->wb_sync_req[DATA])) { /* to avoid potential deadlock */ if (current->plug) blk_finish_plug(current->plug); goto skip_write; } if (__should_serialize_io(inode, wbc)) { mutex_lock(&sbi->writepages); locked = true; } blk_start_plug(&plug); ret = f2fs_write_cache_pages(mapping, wbc, io_type); blk_finish_plug(&plug); if (locked) mutex_unlock(&sbi->writepages); if (wbc->sync_mode == WB_SYNC_ALL) atomic_dec(&sbi->wb_sync_req[DATA]); /* * if some pages were truncated, we cannot guarantee its mapping->host * to detect pending bios. */ f2fs_remove_dirty_inode(inode); return ret; skip_write: wbc->pages_skipped += get_dirty_pages(inode); trace_f2fs_writepages(mapping->host, wbc, DATA); return 0; } static int f2fs_write_data_pages(struct address_space *mapping, struct writeback_control *wbc) { struct inode *inode = mapping->host; return __f2fs_write_data_pages(mapping, wbc, F2FS_I(inode)->cp_task == current ? FS_CP_DATA_IO : FS_DATA_IO); } void f2fs_write_failed(struct inode *inode, loff_t to) { loff_t i_size = i_size_read(inode); if (IS_NOQUOTA(inode)) return; /* In the fs-verity case, f2fs_end_enable_verity() does the truncate */ if (to > i_size && !f2fs_verity_in_progress(inode)) { f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); filemap_invalidate_lock(inode->i_mapping); truncate_pagecache(inode, i_size); f2fs_truncate_blocks(inode, i_size, true); filemap_invalidate_unlock(inode->i_mapping); f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); } } static int prepare_write_begin(struct f2fs_sb_info *sbi, struct page *page, loff_t pos, unsigned len, block_t *blk_addr, bool *node_changed) { struct inode *inode = page->mapping->host; pgoff_t index = page->index; struct dnode_of_data dn; struct page *ipage; bool locked = false; int flag = F2FS_GET_BLOCK_PRE_AIO; int err = 0; /* * If a whole page is being written and we already preallocated all the * blocks, then there is no need to get a block address now. */ if (len == PAGE_SIZE && is_inode_flag_set(inode, FI_PREALLOCATED_ALL)) return 0; /* f2fs_lock_op avoids race between write CP and convert_inline_page */ if (f2fs_has_inline_data(inode)) { if (pos + len > MAX_INLINE_DATA(inode)) flag = F2FS_GET_BLOCK_DEFAULT; f2fs_map_lock(sbi, flag); locked = true; } else if ((pos & PAGE_MASK) >= i_size_read(inode)) { f2fs_map_lock(sbi, flag); locked = true; } restart: /* check inline_data */ ipage = f2fs_get_node_page(sbi, inode->i_ino); if (IS_ERR(ipage)) { err = PTR_ERR(ipage); goto unlock_out; } set_new_dnode(&dn, inode, ipage, ipage, 0); if (f2fs_has_inline_data(inode)) { if (pos + len <= MAX_INLINE_DATA(inode)) { f2fs_do_read_inline_data(page, ipage); set_inode_flag(inode, FI_DATA_EXIST); if (inode->i_nlink) set_page_private_inline(ipage); goto out; } err = f2fs_convert_inline_page(&dn, page); if (err || dn.data_blkaddr != NULL_ADDR) goto out; } if (!f2fs_lookup_read_extent_cache_block(inode, index, &dn.data_blkaddr)) { if (locked) { err = f2fs_reserve_block(&dn, index); goto out; } /* hole case */ err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE); if (!err && dn.data_blkaddr != NULL_ADDR) goto out; f2fs_put_dnode(&dn); f2fs_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO); WARN_ON(flag != F2FS_GET_BLOCK_PRE_AIO); locked = true; goto restart; } out: if (!err) { /* convert_inline_page can make node_changed */ *blk_addr = dn.data_blkaddr; *node_changed = dn.node_changed; } f2fs_put_dnode(&dn); unlock_out: if (locked) f2fs_map_unlock(sbi, flag); return err; } static int __find_data_block(struct inode *inode, pgoff_t index, block_t *blk_addr) { struct dnode_of_data dn; struct page *ipage; int err = 0; ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino); if (IS_ERR(ipage)) return PTR_ERR(ipage); set_new_dnode(&dn, inode, ipage, ipage, 0); if (!f2fs_lookup_read_extent_cache_block(inode, index, &dn.data_blkaddr)) { /* hole case */ err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE); if (err) { dn.data_blkaddr = NULL_ADDR; err = 0; } } *blk_addr = dn.data_blkaddr; f2fs_put_dnode(&dn); return err; } static int __reserve_data_block(struct inode *inode, pgoff_t index, block_t *blk_addr, bool *node_changed) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct dnode_of_data dn; struct page *ipage; int err = 0; f2fs_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO); ipage = f2fs_get_node_page(sbi, inode->i_ino); if (IS_ERR(ipage)) { err = PTR_ERR(ipage); goto unlock_out; } set_new_dnode(&dn, inode, ipage, ipage, 0); if (!f2fs_lookup_read_extent_cache_block(dn.inode, index, &dn.data_blkaddr)) err = f2fs_reserve_block(&dn, index); *blk_addr = dn.data_blkaddr; *node_changed = dn.node_changed; f2fs_put_dnode(&dn); unlock_out: f2fs_map_unlock(sbi, F2FS_GET_BLOCK_PRE_AIO); return err; } static int prepare_atomic_write_begin(struct f2fs_sb_info *sbi, struct page *page, loff_t pos, unsigned int len, block_t *blk_addr, bool *node_changed, bool *use_cow) { struct inode *inode = page->mapping->host; struct inode *cow_inode = F2FS_I(inode)->cow_inode; pgoff_t index = page->index; int err = 0; block_t ori_blk_addr = NULL_ADDR; /* If pos is beyond the end of file, reserve a new block in COW inode */ if ((pos & PAGE_MASK) >= i_size_read(inode)) goto reserve_block; /* Look for the block in COW inode first */ err = __find_data_block(cow_inode, index, blk_addr); if (err) { return err; } else if (*blk_addr != NULL_ADDR) { *use_cow = true; return 0; } if (is_inode_flag_set(inode, FI_ATOMIC_REPLACE)) goto reserve_block; /* Look for the block in the original inode */ err = __find_data_block(inode, index, &ori_blk_addr); if (err) return err; reserve_block: /* Finally, we should reserve a new block in COW inode for the update */ err = __reserve_data_block(cow_inode, index, blk_addr, node_changed); if (err) return err; inc_atomic_write_cnt(inode); if (ori_blk_addr != NULL_ADDR) *blk_addr = ori_blk_addr; return 0; } static int f2fs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, struct page **pagep, void **fsdata) { struct inode *inode = mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct page *page = NULL; pgoff_t index = ((unsigned long long) pos) >> PAGE_SHIFT; bool need_balance = false; bool use_cow = false; block_t blkaddr = NULL_ADDR; int err = 0; trace_f2fs_write_begin(inode, pos, len); if (!f2fs_is_checkpoint_ready(sbi)) { err = -ENOSPC; goto fail; } /* * We should check this at this moment to avoid deadlock on inode page * and #0 page. The locking rule for inline_data conversion should be: * lock_page(page #0) -> lock_page(inode_page) */ if (index != 0) { err = f2fs_convert_inline_inode(inode); if (err) goto fail; } #ifdef CONFIG_F2FS_FS_COMPRESSION if (f2fs_compressed_file(inode)) { int ret; *fsdata = NULL; if (len == PAGE_SIZE && !(f2fs_is_atomic_file(inode))) goto repeat; ret = f2fs_prepare_compress_overwrite(inode, pagep, index, fsdata); if (ret < 0) { err = ret; goto fail; } else if (ret) { return 0; } } #endif repeat: /* * Do not use grab_cache_page_write_begin() to avoid deadlock due to * wait_for_stable_page. Will wait that below with our IO control. */ page = f2fs_pagecache_get_page(mapping, index, FGP_LOCK | FGP_WRITE | FGP_CREAT, GFP_NOFS); if (!page) { err = -ENOMEM; goto fail; } /* TODO: cluster can be compressed due to race with .writepage */ *pagep = page; if (f2fs_is_atomic_file(inode)) err = prepare_atomic_write_begin(sbi, page, pos, len, &blkaddr, &need_balance, &use_cow); else err = prepare_write_begin(sbi, page, pos, len, &blkaddr, &need_balance); if (err) goto fail; if (need_balance && !IS_NOQUOTA(inode) && has_not_enough_free_secs(sbi, 0, 0)) { unlock_page(page); f2fs_balance_fs(sbi, true); lock_page(page); if (page->mapping != mapping) { /* The page got truncated from under us */ f2fs_put_page(page, 1); goto repeat; } } f2fs_wait_on_page_writeback(page, DATA, false, true); if (len == PAGE_SIZE || PageUptodate(page)) return 0; if (!(pos & (PAGE_SIZE - 1)) && (pos + len) >= i_size_read(inode) && !f2fs_verity_in_progress(inode)) { zero_user_segment(page, len, PAGE_SIZE); return 0; } if (blkaddr == NEW_ADDR) { zero_user_segment(page, 0, PAGE_SIZE); SetPageUptodate(page); } else { if (!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE_READ)) { err = -EFSCORRUPTED; f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); goto fail; } err = f2fs_submit_page_read(use_cow ? F2FS_I(inode)->cow_inode : inode, page, blkaddr, 0, true); if (err) goto fail; lock_page(page); if (unlikely(page->mapping != mapping)) { f2fs_put_page(page, 1); goto repeat; } if (unlikely(!PageUptodate(page))) { err = -EIO; goto fail; } } return 0; fail: f2fs_put_page(page, 1); f2fs_write_failed(inode, pos + len); return err; } static int f2fs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { struct inode *inode = page->mapping->host; trace_f2fs_write_end(inode, pos, len, copied); /* * This should be come from len == PAGE_SIZE, and we expect copied * should be PAGE_SIZE. Otherwise, we treat it with zero copied and * let generic_perform_write() try to copy data again through copied=0. */ if (!PageUptodate(page)) { if (unlikely(copied != len)) copied = 0; else SetPageUptodate(page); } #ifdef CONFIG_F2FS_FS_COMPRESSION /* overwrite compressed file */ if (f2fs_compressed_file(inode) && fsdata) { f2fs_compress_write_end(inode, fsdata, page->index, copied); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); if (pos + copied > i_size_read(inode) && !f2fs_verity_in_progress(inode)) f2fs_i_size_write(inode, pos + copied); return copied; } #endif if (!copied) goto unlock_out; set_page_dirty(page); if (pos + copied > i_size_read(inode) && !f2fs_verity_in_progress(inode)) { f2fs_i_size_write(inode, pos + copied); if (f2fs_is_atomic_file(inode)) f2fs_i_size_write(F2FS_I(inode)->cow_inode, pos + copied); } unlock_out: f2fs_put_page(page, 1); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return copied; } void f2fs_invalidate_folio(struct folio *folio, size_t offset, size_t length) { struct inode *inode = folio->mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); if (inode->i_ino >= F2FS_ROOT_INO(sbi) && (offset || length != folio_size(folio))) return; if (folio_test_dirty(folio)) { if (inode->i_ino == F2FS_META_INO(sbi)) { dec_page_count(sbi, F2FS_DIRTY_META); } else if (inode->i_ino == F2FS_NODE_INO(sbi)) { dec_page_count(sbi, F2FS_DIRTY_NODES); } else { inode_dec_dirty_pages(inode); f2fs_remove_dirty_inode(inode); } } clear_page_private_all(&folio->page); } bool f2fs_release_folio(struct folio *folio, gfp_t wait) { /* If this is dirty folio, keep private data */ if (folio_test_dirty(folio)) return false; clear_page_private_all(&folio->page); return true; } static bool f2fs_dirty_data_folio(struct address_space *mapping, struct folio *folio) { struct inode *inode = mapping->host; trace_f2fs_set_page_dirty(&folio->page, DATA); if (!folio_test_uptodate(folio)) folio_mark_uptodate(folio); BUG_ON(folio_test_swapcache(folio)); if (filemap_dirty_folio(mapping, folio)) { f2fs_update_dirty_folio(inode, folio); return true; } return false; } static sector_t f2fs_bmap_compress(struct inode *inode, sector_t block) { #ifdef CONFIG_F2FS_FS_COMPRESSION struct dnode_of_data dn; sector_t start_idx, blknr = 0; int ret; start_idx = round_down(block, F2FS_I(inode)->i_cluster_size); set_new_dnode(&dn, inode, NULL, NULL, 0); ret = f2fs_get_dnode_of_data(&dn, start_idx, LOOKUP_NODE); if (ret) return 0; if (dn.data_blkaddr != COMPRESS_ADDR) { dn.ofs_in_node += block - start_idx; blknr = f2fs_data_blkaddr(&dn); if (!__is_valid_data_blkaddr(blknr)) blknr = 0; } f2fs_put_dnode(&dn); return blknr; #else return 0; #endif } static sector_t f2fs_bmap(struct address_space *mapping, sector_t block) { struct inode *inode = mapping->host; sector_t blknr = 0; if (f2fs_has_inline_data(inode)) goto out; /* make sure allocating whole blocks */ if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) filemap_write_and_wait(mapping); /* Block number less than F2FS MAX BLOCKS */ if (unlikely(block >= max_file_blocks(inode))) goto out; if (f2fs_compressed_file(inode)) { blknr = f2fs_bmap_compress(inode, block); } else { struct f2fs_map_blocks map; memset(&map, 0, sizeof(map)); map.m_lblk = block; map.m_len = 1; map.m_next_pgofs = NULL; map.m_seg_type = NO_CHECK_TYPE; if (!f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_BMAP)) blknr = map.m_pblk; } out: trace_f2fs_bmap(inode, block, blknr); return blknr; } #ifdef CONFIG_SWAP static int f2fs_migrate_blocks(struct inode *inode, block_t start_blk, unsigned int blkcnt) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); unsigned int blkofs; unsigned int blk_per_sec = BLKS_PER_SEC(sbi); unsigned int secidx = start_blk / blk_per_sec; unsigned int end_sec = secidx + blkcnt / blk_per_sec; int ret = 0; f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); filemap_invalidate_lock(inode->i_mapping); set_inode_flag(inode, FI_ALIGNED_WRITE); set_inode_flag(inode, FI_OPU_WRITE); for (; secidx < end_sec; secidx++) { f2fs_down_write(&sbi->pin_sem); f2fs_lock_op(sbi); f2fs_allocate_new_section(sbi, CURSEG_COLD_DATA_PINNED, false); f2fs_unlock_op(sbi); set_inode_flag(inode, FI_SKIP_WRITES); for (blkofs = 0; blkofs < blk_per_sec; blkofs++) { struct page *page; unsigned int blkidx = secidx * blk_per_sec + blkofs; page = f2fs_get_lock_data_page(inode, blkidx, true); if (IS_ERR(page)) { f2fs_up_write(&sbi->pin_sem); ret = PTR_ERR(page); goto done; } set_page_dirty(page); f2fs_put_page(page, 1); } clear_inode_flag(inode, FI_SKIP_WRITES); ret = filemap_fdatawrite(inode->i_mapping); f2fs_up_write(&sbi->pin_sem); if (ret) break; } done: clear_inode_flag(inode, FI_SKIP_WRITES); clear_inode_flag(inode, FI_OPU_WRITE); clear_inode_flag(inode, FI_ALIGNED_WRITE); filemap_invalidate_unlock(inode->i_mapping); f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); return ret; } static int check_swap_activate(struct swap_info_struct *sis, struct file *swap_file, sector_t *span) { struct address_space *mapping = swap_file->f_mapping; struct inode *inode = mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); sector_t cur_lblock; sector_t last_lblock; sector_t pblock; sector_t lowest_pblock = -1; sector_t highest_pblock = 0; int nr_extents = 0; unsigned long nr_pblocks; unsigned int blks_per_sec = BLKS_PER_SEC(sbi); unsigned int sec_blks_mask = BLKS_PER_SEC(sbi) - 1; unsigned int not_aligned = 0; int ret = 0; /* * Map all the blocks into the extent list. This code doesn't try * to be very smart. */ cur_lblock = 0; last_lblock = bytes_to_blks(inode, i_size_read(inode)); while (cur_lblock < last_lblock && cur_lblock < sis->max) { struct f2fs_map_blocks map; retry: cond_resched(); memset(&map, 0, sizeof(map)); map.m_lblk = cur_lblock; map.m_len = last_lblock - cur_lblock; map.m_next_pgofs = NULL; map.m_next_extent = NULL; map.m_seg_type = NO_CHECK_TYPE; map.m_may_create = false; ret = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_FIEMAP); if (ret) goto out; /* hole */ if (!(map.m_flags & F2FS_MAP_FLAGS)) { f2fs_err(sbi, "Swapfile has holes"); ret = -EINVAL; goto out; } pblock = map.m_pblk; nr_pblocks = map.m_len; if ((pblock - SM_I(sbi)->main_blkaddr) & sec_blks_mask || nr_pblocks & sec_blks_mask) { not_aligned++; nr_pblocks = roundup(nr_pblocks, blks_per_sec); if (cur_lblock + nr_pblocks > sis->max) nr_pblocks -= blks_per_sec; if (!nr_pblocks) { /* this extent is last one */ nr_pblocks = map.m_len; f2fs_warn(sbi, "Swapfile: last extent is not aligned to section"); goto next; } ret = f2fs_migrate_blocks(inode, cur_lblock, nr_pblocks); if (ret) goto out; goto retry; } next: if (cur_lblock + nr_pblocks >= sis->max) nr_pblocks = sis->max - cur_lblock; if (cur_lblock) { /* exclude the header page */ if (pblock < lowest_pblock) lowest_pblock = pblock; if (pblock + nr_pblocks - 1 > highest_pblock) highest_pblock = pblock + nr_pblocks - 1; } /* * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks */ ret = add_swap_extent(sis, cur_lblock, nr_pblocks, pblock); if (ret < 0) goto out; nr_extents += ret; cur_lblock += nr_pblocks; } ret = nr_extents; *span = 1 + highest_pblock - lowest_pblock; if (cur_lblock == 0) cur_lblock = 1; /* force Empty message */ sis->max = cur_lblock; sis->pages = cur_lblock - 1; sis->highest_bit = cur_lblock - 1; out: if (not_aligned) f2fs_warn(sbi, "Swapfile (%u) is not align to section: 1) creat(), 2) ioctl(F2FS_IOC_SET_PIN_FILE), 3) fallocate(%lu * N)", not_aligned, blks_per_sec * F2FS_BLKSIZE); return ret; } static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file, sector_t *span) { struct inode *inode = file_inode(file); int ret; if (!S_ISREG(inode->i_mode)) return -EINVAL; if (f2fs_readonly(F2FS_I_SB(inode)->sb)) return -EROFS; if (f2fs_lfs_mode(F2FS_I_SB(inode))) { f2fs_err(F2FS_I_SB(inode), "Swapfile not supported in LFS mode"); return -EINVAL; } ret = f2fs_convert_inline_inode(inode); if (ret) return ret; if (!f2fs_disable_compressed_file(inode)) return -EINVAL; f2fs_precache_extents(inode); ret = check_swap_activate(sis, file, span); if (ret < 0) return ret; stat_inc_swapfile_inode(inode); set_inode_flag(inode, FI_PIN_FILE); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return ret; } static void f2fs_swap_deactivate(struct file *file) { struct inode *inode = file_inode(file); stat_dec_swapfile_inode(inode); clear_inode_flag(inode, FI_PIN_FILE); } #else static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file, sector_t *span) { return -EOPNOTSUPP; } static void f2fs_swap_deactivate(struct file *file) { } #endif const struct address_space_operations f2fs_dblock_aops = { .read_folio = f2fs_read_data_folio, .readahead = f2fs_readahead, .writepage = f2fs_write_data_page, .writepages = f2fs_write_data_pages, .write_begin = f2fs_write_begin, .write_end = f2fs_write_end, .dirty_folio = f2fs_dirty_data_folio, .migrate_folio = filemap_migrate_folio, .invalidate_folio = f2fs_invalidate_folio, .release_folio = f2fs_release_folio, .bmap = f2fs_bmap, .swap_activate = f2fs_swap_activate, .swap_deactivate = f2fs_swap_deactivate, }; void f2fs_clear_page_cache_dirty_tag(struct page *page) { struct address_space *mapping = page_mapping(page); unsigned long flags; xa_lock_irqsave(&mapping->i_pages, flags); __xa_clear_mark(&mapping->i_pages, page_index(page), PAGECACHE_TAG_DIRTY); xa_unlock_irqrestore(&mapping->i_pages, flags); } int __init f2fs_init_post_read_processing(void) { bio_post_read_ctx_cache = kmem_cache_create("f2fs_bio_post_read_ctx", sizeof(struct bio_post_read_ctx), 0, 0, NULL); if (!bio_post_read_ctx_cache) goto fail; bio_post_read_ctx_pool = mempool_create_slab_pool(NUM_PREALLOC_POST_READ_CTXS, bio_post_read_ctx_cache); if (!bio_post_read_ctx_pool) goto fail_free_cache; return 0; fail_free_cache: kmem_cache_destroy(bio_post_read_ctx_cache); fail: return -ENOMEM; } void f2fs_destroy_post_read_processing(void) { mempool_destroy(bio_post_read_ctx_pool); kmem_cache_destroy(bio_post_read_ctx_cache); } int f2fs_init_post_read_wq(struct f2fs_sb_info *sbi) { if (!f2fs_sb_has_encrypt(sbi) && !f2fs_sb_has_verity(sbi) && !f2fs_sb_has_compression(sbi)) return 0; sbi->post_read_wq = alloc_workqueue("f2fs_post_read_wq", WQ_UNBOUND | WQ_HIGHPRI, num_online_cpus()); return sbi->post_read_wq ? 0 : -ENOMEM; } void f2fs_destroy_post_read_wq(struct f2fs_sb_info *sbi) { if (sbi->post_read_wq) destroy_workqueue(sbi->post_read_wq); } int __init f2fs_init_bio_entry_cache(void) { bio_entry_slab = f2fs_kmem_cache_create("f2fs_bio_entry_slab", sizeof(struct bio_entry)); return bio_entry_slab ? 0 : -ENOMEM; } void f2fs_destroy_bio_entry_cache(void) { kmem_cache_destroy(bio_entry_slab); } static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned int flags, struct iomap *iomap, struct iomap *srcmap) { struct f2fs_map_blocks map = {}; pgoff_t next_pgofs = 0; int err; map.m_lblk = bytes_to_blks(inode, offset); map.m_len = bytes_to_blks(inode, offset + length - 1) - map.m_lblk + 1; map.m_next_pgofs = &next_pgofs; map.m_seg_type = f2fs_rw_hint_to_seg_type(inode->i_write_hint); if (flags & IOMAP_WRITE) map.m_may_create = true; err = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_DIO); if (err) return err; iomap->offset = blks_to_bytes(inode, map.m_lblk); /* * When inline encryption is enabled, sometimes I/O to an encrypted file * has to be broken up to guarantee DUN contiguity. Handle this by * limiting the length of the mapping returned. */ map.m_len = fscrypt_limit_io_blocks(inode, map.m_lblk, map.m_len); /* * We should never see delalloc or compressed extents here based on * prior flushing and checks. */ if (WARN_ON_ONCE(map.m_pblk == NEW_ADDR)) return -EINVAL; if (WARN_ON_ONCE(map.m_pblk == COMPRESS_ADDR)) return -EINVAL; if (map.m_pblk != NULL_ADDR) { iomap->length = blks_to_bytes(inode, map.m_len); iomap->type = IOMAP_MAPPED; iomap->flags |= IOMAP_F_MERGED; iomap->bdev = map.m_bdev; iomap->addr = blks_to_bytes(inode, map.m_pblk); } else { if (flags & IOMAP_WRITE) return -ENOTBLK; iomap->length = blks_to_bytes(inode, next_pgofs) - iomap->offset; iomap->type = IOMAP_HOLE; iomap->addr = IOMAP_NULL_ADDR; } if (map.m_flags & F2FS_MAP_NEW) iomap->flags |= IOMAP_F_NEW; if ((inode->i_state & I_DIRTY_DATASYNC) || offset + length > i_size_read(inode)) iomap->flags |= IOMAP_F_DIRTY; return 0; } const struct iomap_ops f2fs_iomap_ops = { .iomap_begin = f2fs_iomap_begin, };
79 2 2 77 51 1 50 36 42 39 1 38 85 3 1 29 2 2 51 2 2 4 7 17 48 1 39 16 85 31 1 55 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 // SPDX-License-Identifier: GPL-2.0-or-later /* Crypto operations using stored keys * * Copyright (c) 2016, Intel Corporation */ #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/scatterlist.h> #include <linux/crypto.h> #include <crypto/hash.h> #include <crypto/kpp.h> #include <crypto/dh.h> #include <crypto/kdf_sp800108.h> #include <keys/user-type.h> #include "internal.h" static ssize_t dh_data_from_key(key_serial_t keyid, const void **data) { struct key *key; key_ref_t key_ref; long status; ssize_t ret; key_ref = lookup_user_key(keyid, 0, KEY_NEED_READ); if (IS_ERR(key_ref)) { ret = -ENOKEY; goto error; } key = key_ref_to_ptr(key_ref); ret = -EOPNOTSUPP; if (key->type == &key_type_user) { down_read(&key->sem); status = key_validate(key); if (status == 0) { const struct user_key_payload *payload; uint8_t *duplicate; payload = user_key_payload_locked(key); duplicate = kmemdup(payload->data, payload->datalen, GFP_KERNEL); if (duplicate) { *data = duplicate; ret = payload->datalen; } else { ret = -ENOMEM; } } up_read(&key->sem); } key_put(key); error: return ret; } static void dh_free_data(struct dh *dh) { kfree_sensitive(dh->key); kfree_sensitive(dh->p); kfree_sensitive(dh->g); } static int kdf_alloc(struct crypto_shash **hash, char *hashname) { struct crypto_shash *tfm; /* allocate synchronous hash */ tfm = crypto_alloc_shash(hashname, 0, 0); if (IS_ERR(tfm)) { pr_info("could not allocate digest TFM handle %s\n", hashname); return PTR_ERR(tfm); } if (crypto_shash_digestsize(tfm) == 0) { crypto_free_shash(tfm); return -EINVAL; } *hash = tfm; return 0; } static void kdf_dealloc(struct crypto_shash *hash) { if (hash) crypto_free_shash(hash); } static int keyctl_dh_compute_kdf(struct crypto_shash *hash, char __user *buffer, size_t buflen, uint8_t *kbuf, size_t kbuflen) { struct kvec kbuf_iov = { .iov_base = kbuf, .iov_len = kbuflen }; uint8_t *outbuf = NULL; int ret; size_t outbuf_len = roundup(buflen, crypto_shash_digestsize(hash)); outbuf = kmalloc(outbuf_len, GFP_KERNEL); if (!outbuf) { ret = -ENOMEM; goto err; } ret = crypto_kdf108_ctr_generate(hash, &kbuf_iov, 1, outbuf, outbuf_len); if (ret) goto err; ret = buflen; if (copy_to_user(buffer, outbuf, buflen) != 0) ret = -EFAULT; err: kfree_sensitive(outbuf); return ret; } long __keyctl_dh_compute(struct keyctl_dh_params __user *params, char __user *buffer, size_t buflen, struct keyctl_kdf_params *kdfcopy) { long ret; ssize_t dlen; int secretlen; int outlen; struct keyctl_dh_params pcopy; struct dh dh_inputs; struct scatterlist outsg; DECLARE_CRYPTO_WAIT(compl); struct crypto_kpp *tfm; struct kpp_request *req; uint8_t *secret; uint8_t *outbuf; struct crypto_shash *hash = NULL; if (!params || (!buffer && buflen)) { ret = -EINVAL; goto out1; } if (copy_from_user(&pcopy, params, sizeof(pcopy)) != 0) { ret = -EFAULT; goto out1; } if (kdfcopy) { char *hashname; if (memchr_inv(kdfcopy->__spare, 0, sizeof(kdfcopy->__spare))) { ret = -EINVAL; goto out1; } if (buflen > KEYCTL_KDF_MAX_OUTPUT_LEN || kdfcopy->otherinfolen > KEYCTL_KDF_MAX_OI_LEN) { ret = -EMSGSIZE; goto out1; } /* get KDF name string */ hashname = strndup_user(kdfcopy->hashname, CRYPTO_MAX_ALG_NAME); if (IS_ERR(hashname)) { ret = PTR_ERR(hashname); goto out1; } /* allocate KDF from the kernel crypto API */ ret = kdf_alloc(&hash, hashname); kfree(hashname); if (ret) goto out1; } memset(&dh_inputs, 0, sizeof(dh_inputs)); dlen = dh_data_from_key(pcopy.prime, &dh_inputs.p); if (dlen < 0) { ret = dlen; goto out1; } dh_inputs.p_size = dlen; dlen = dh_data_from_key(pcopy.base, &dh_inputs.g); if (dlen < 0) { ret = dlen; goto out2; } dh_inputs.g_size = dlen; dlen = dh_data_from_key(pcopy.private, &dh_inputs.key); if (dlen < 0) { ret = dlen; goto out2; } dh_inputs.key_size = dlen; secretlen = crypto_dh_key_len(&dh_inputs); secret = kmalloc(secretlen, GFP_KERNEL); if (!secret) { ret = -ENOMEM; goto out2; } ret = crypto_dh_encode_key(secret, secretlen, &dh_inputs); if (ret) goto out3; tfm = crypto_alloc_kpp("dh", 0, 0); if (IS_ERR(tfm)) { ret = PTR_ERR(tfm); goto out3; } ret = crypto_kpp_set_secret(tfm, secret, secretlen); if (ret) goto out4; outlen = crypto_kpp_maxsize(tfm); if (!kdfcopy) { /* * When not using a KDF, buflen 0 is used to read the * required buffer length */ if (buflen == 0) { ret = outlen; goto out4; } else if (outlen > buflen) { ret = -EOVERFLOW; goto out4; } } outbuf = kzalloc(kdfcopy ? (outlen + kdfcopy->otherinfolen) : outlen, GFP_KERNEL); if (!outbuf) { ret = -ENOMEM; goto out4; } sg_init_one(&outsg, outbuf, outlen); req = kpp_request_alloc(tfm, GFP_KERNEL); if (!req) { ret = -ENOMEM; goto out5; } kpp_request_set_input(req, NULL, 0); kpp_request_set_output(req, &outsg, outlen); kpp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, crypto_req_done, &compl); /* * For DH, generate_public_key and generate_shared_secret are * the same calculation */ ret = crypto_kpp_generate_public_key(req); ret = crypto_wait_req(ret, &compl); if (ret) goto out6; if (kdfcopy) { /* * Concatenate SP800-56A otherinfo past DH shared secret -- the * input to the KDF is (DH shared secret || otherinfo) */ if (copy_from_user(outbuf + req->dst_len, kdfcopy->otherinfo, kdfcopy->otherinfolen) != 0) { ret = -EFAULT; goto out6; } ret = keyctl_dh_compute_kdf(hash, buffer, buflen, outbuf, req->dst_len + kdfcopy->otherinfolen); } else if (copy_to_user(buffer, outbuf, req->dst_len) == 0) { ret = req->dst_len; } else { ret = -EFAULT; } out6: kpp_request_free(req); out5: kfree_sensitive(outbuf); out4: crypto_free_kpp(tfm); out3: kfree_sensitive(secret); out2: dh_free_data(&dh_inputs); out1: kdf_dealloc(hash); return ret; } long keyctl_dh_compute(struct keyctl_dh_params __user *params, char __user *buffer, size_t buflen, struct keyctl_kdf_params __user *kdf) { struct keyctl_kdf_params kdfcopy; if (!kdf) return __keyctl_dh_compute(params, buffer, buflen, NULL); if (copy_from_user(&kdfcopy, kdf, sizeof(kdfcopy)) != 0) return -EFAULT; return __keyctl_dh_compute(params, buffer, buflen, &kdfcopy); }
1 1 2 146 72 2 3 67 2 68 1 69 70 84 81 1 83 18 1 11 8 18 1 1 18 1 1 47 47 44 44 44 47 1 47 1 47 46 47 12 12 12 6 6 12 1 1 76 76 34 33 12 11 12 3 99 33 14 3 38 31 2 70 39 9 9 9 9 2 9 9 3 6 9 9 147 96 5 135 108 147 147 108 147 147 88 147 5 1 1 1 89 1 88 89 44 89 88 47 79 1 1 1 1 1 1 1 43 43 73 1 72 60 38 39 2 1 2 34 3 3 3 1 2 70 1 2 67 68 3 65 4 4 1 1 9 9 1 9 9 9 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 // SPDX-License-Identifier: GPL-2.0 /* * * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved. * */ #include <linux/blkdev.h> #include <linux/buffer_head.h> #include <linux/fs.h> #include <linux/kernel.h> #include <linux/nls.h> #include "debug.h" #include "ntfs.h" #include "ntfs_fs.h" // clang-format off const struct cpu_str NAME_MFT = { 4, 0, { '$', 'M', 'F', 'T' }, }; const struct cpu_str NAME_MIRROR = { 8, 0, { '$', 'M', 'F', 'T', 'M', 'i', 'r', 'r' }, }; const struct cpu_str NAME_LOGFILE = { 8, 0, { '$', 'L', 'o', 'g', 'F', 'i', 'l', 'e' }, }; const struct cpu_str NAME_VOLUME = { 7, 0, { '$', 'V', 'o', 'l', 'u', 'm', 'e' }, }; const struct cpu_str NAME_ATTRDEF = { 8, 0, { '$', 'A', 't', 't', 'r', 'D', 'e', 'f' }, }; const struct cpu_str NAME_ROOT = { 1, 0, { '.' }, }; const struct cpu_str NAME_BITMAP = { 7, 0, { '$', 'B', 'i', 't', 'm', 'a', 'p' }, }; const struct cpu_str NAME_BOOT = { 5, 0, { '$', 'B', 'o', 'o', 't' }, }; const struct cpu_str NAME_BADCLUS = { 8, 0, { '$', 'B', 'a', 'd', 'C', 'l', 'u', 's' }, }; const struct cpu_str NAME_QUOTA = { 6, 0, { '$', 'Q', 'u', 'o', 't', 'a' }, }; const struct cpu_str NAME_SECURE = { 7, 0, { '$', 'S', 'e', 'c', 'u', 'r', 'e' }, }; const struct cpu_str NAME_UPCASE = { 7, 0, { '$', 'U', 'p', 'C', 'a', 's', 'e' }, }; const struct cpu_str NAME_EXTEND = { 7, 0, { '$', 'E', 'x', 't', 'e', 'n', 'd' }, }; const struct cpu_str NAME_OBJID = { 6, 0, { '$', 'O', 'b', 'j', 'I', 'd' }, }; const struct cpu_str NAME_REPARSE = { 8, 0, { '$', 'R', 'e', 'p', 'a', 'r', 's', 'e' }, }; const struct cpu_str NAME_USNJRNL = { 8, 0, { '$', 'U', 's', 'n', 'J', 'r', 'n', 'l' }, }; const __le16 BAD_NAME[4] = { cpu_to_le16('$'), cpu_to_le16('B'), cpu_to_le16('a'), cpu_to_le16('d'), }; const __le16 I30_NAME[4] = { cpu_to_le16('$'), cpu_to_le16('I'), cpu_to_le16('3'), cpu_to_le16('0'), }; const __le16 SII_NAME[4] = { cpu_to_le16('$'), cpu_to_le16('S'), cpu_to_le16('I'), cpu_to_le16('I'), }; const __le16 SDH_NAME[4] = { cpu_to_le16('$'), cpu_to_le16('S'), cpu_to_le16('D'), cpu_to_le16('H'), }; const __le16 SDS_NAME[4] = { cpu_to_le16('$'), cpu_to_le16('S'), cpu_to_le16('D'), cpu_to_le16('S'), }; const __le16 SO_NAME[2] = { cpu_to_le16('$'), cpu_to_le16('O'), }; const __le16 SQ_NAME[2] = { cpu_to_le16('$'), cpu_to_le16('Q'), }; const __le16 SR_NAME[2] = { cpu_to_le16('$'), cpu_to_le16('R'), }; #ifdef CONFIG_NTFS3_LZX_XPRESS const __le16 WOF_NAME[17] = { cpu_to_le16('W'), cpu_to_le16('o'), cpu_to_le16('f'), cpu_to_le16('C'), cpu_to_le16('o'), cpu_to_le16('m'), cpu_to_le16('p'), cpu_to_le16('r'), cpu_to_le16('e'), cpu_to_le16('s'), cpu_to_le16('s'), cpu_to_le16('e'), cpu_to_le16('d'), cpu_to_le16('D'), cpu_to_le16('a'), cpu_to_le16('t'), cpu_to_le16('a'), }; #endif static const __le16 CON_NAME[3] = { cpu_to_le16('C'), cpu_to_le16('O'), cpu_to_le16('N'), }; static const __le16 NUL_NAME[3] = { cpu_to_le16('N'), cpu_to_le16('U'), cpu_to_le16('L'), }; static const __le16 AUX_NAME[3] = { cpu_to_le16('A'), cpu_to_le16('U'), cpu_to_le16('X'), }; static const __le16 PRN_NAME[3] = { cpu_to_le16('P'), cpu_to_le16('R'), cpu_to_le16('N'), }; static const __le16 COM_NAME[3] = { cpu_to_le16('C'), cpu_to_le16('O'), cpu_to_le16('M'), }; static const __le16 LPT_NAME[3] = { cpu_to_le16('L'), cpu_to_le16('P'), cpu_to_le16('T'), }; // clang-format on /* * ntfs_fix_pre_write - Insert fixups into @rhdr before writing to disk. */ bool ntfs_fix_pre_write(struct NTFS_RECORD_HEADER *rhdr, size_t bytes) { u16 *fixup, *ptr; u16 sample; u16 fo = le16_to_cpu(rhdr->fix_off); u16 fn = le16_to_cpu(rhdr->fix_num); if ((fo & 1) || fo + fn * sizeof(short) > SECTOR_SIZE || !fn-- || fn * SECTOR_SIZE > bytes) { return false; } /* Get fixup pointer. */ fixup = Add2Ptr(rhdr, fo); if (*fixup >= 0x7FFF) *fixup = 1; else *fixup += 1; sample = *fixup; ptr = Add2Ptr(rhdr, SECTOR_SIZE - sizeof(short)); while (fn--) { *++fixup = *ptr; *ptr = sample; ptr += SECTOR_SIZE / sizeof(short); } return true; } /* * ntfs_fix_post_read - Remove fixups after reading from disk. * * Return: < 0 if error, 0 if ok, 1 if need to update fixups. */ int ntfs_fix_post_read(struct NTFS_RECORD_HEADER *rhdr, size_t bytes, bool simple) { int ret; u16 *fixup, *ptr; u16 sample, fo, fn; fo = le16_to_cpu(rhdr->fix_off); fn = simple ? ((bytes >> SECTOR_SHIFT) + 1) : le16_to_cpu(rhdr->fix_num); /* Check errors. */ if ((fo & 1) || fo + fn * sizeof(short) > SECTOR_SIZE || !fn-- || fn * SECTOR_SIZE > bytes) { return -E_NTFS_CORRUPT; } /* Get fixup pointer. */ fixup = Add2Ptr(rhdr, fo); sample = *fixup; ptr = Add2Ptr(rhdr, SECTOR_SIZE - sizeof(short)); ret = 0; while (fn--) { /* Test current word. */ if (*ptr != sample) { /* Fixup does not match! Is it serious error? */ ret = -E_NTFS_FIXUP; } /* Replace fixup. */ *ptr = *++fixup; ptr += SECTOR_SIZE / sizeof(short); } return ret; } /* * ntfs_extend_init - Load $Extend file. */ int ntfs_extend_init(struct ntfs_sb_info *sbi) { int err; struct super_block *sb = sbi->sb; struct inode *inode, *inode2; struct MFT_REF ref; if (sbi->volume.major_ver < 3) { ntfs_notice(sb, "Skip $Extend 'cause NTFS version"); return 0; } ref.low = cpu_to_le32(MFT_REC_EXTEND); ref.high = 0; ref.seq = cpu_to_le16(MFT_REC_EXTEND); inode = ntfs_iget5(sb, &ref, &NAME_EXTEND); if (IS_ERR(inode)) { err = PTR_ERR(inode); ntfs_err(sb, "Failed to load $Extend (%d).", err); inode = NULL; goto out; } /* If ntfs_iget5() reads from disk it never returns bad inode. */ if (!S_ISDIR(inode->i_mode)) { err = -EINVAL; goto out; } /* Try to find $ObjId */ inode2 = dir_search_u(inode, &NAME_OBJID, NULL); if (inode2 && !IS_ERR(inode2)) { if (is_bad_inode(inode2)) { iput(inode2); } else { sbi->objid.ni = ntfs_i(inode2); sbi->objid_no = inode2->i_ino; } } /* Try to find $Quota */ inode2 = dir_search_u(inode, &NAME_QUOTA, NULL); if (inode2 && !IS_ERR(inode2)) { sbi->quota_no = inode2->i_ino; iput(inode2); } /* Try to find $Reparse */ inode2 = dir_search_u(inode, &NAME_REPARSE, NULL); if (inode2 && !IS_ERR(inode2)) { sbi->reparse.ni = ntfs_i(inode2); sbi->reparse_no = inode2->i_ino; } /* Try to find $UsnJrnl */ inode2 = dir_search_u(inode, &NAME_USNJRNL, NULL); if (inode2 && !IS_ERR(inode2)) { sbi->usn_jrnl_no = inode2->i_ino; iput(inode2); } err = 0; out: iput(inode); return err; } int ntfs_loadlog_and_replay(struct ntfs_inode *ni, struct ntfs_sb_info *sbi) { int err = 0; struct super_block *sb = sbi->sb; bool initialized = false; struct MFT_REF ref; struct inode *inode; /* Check for 4GB. */ if (ni->vfs_inode.i_size >= 0x100000000ull) { ntfs_err(sb, "\x24LogFile is large than 4G."); err = -EINVAL; goto out; } sbi->flags |= NTFS_FLAGS_LOG_REPLAYING; ref.low = cpu_to_le32(MFT_REC_MFT); ref.high = 0; ref.seq = cpu_to_le16(1); inode = ntfs_iget5(sb, &ref, NULL); if (IS_ERR(inode)) inode = NULL; if (!inode) { /* Try to use MFT copy. */ u64 t64 = sbi->mft.lbo; sbi->mft.lbo = sbi->mft.lbo2; inode = ntfs_iget5(sb, &ref, NULL); sbi->mft.lbo = t64; if (IS_ERR(inode)) inode = NULL; } if (!inode) { err = -EINVAL; ntfs_err(sb, "Failed to load $MFT."); goto out; } sbi->mft.ni = ntfs_i(inode); /* LogFile should not contains attribute list. */ err = ni_load_all_mi(sbi->mft.ni); if (!err) err = log_replay(ni, &initialized); iput(inode); sbi->mft.ni = NULL; sync_blockdev(sb->s_bdev); invalidate_bdev(sb->s_bdev); if (sbi->flags & NTFS_FLAGS_NEED_REPLAY) { err = 0; goto out; } if (sb_rdonly(sb) || !initialized) goto out; /* Fill LogFile by '-1' if it is initialized. */ err = ntfs_bio_fill_1(sbi, &ni->file.run); out: sbi->flags &= ~NTFS_FLAGS_LOG_REPLAYING; return err; } /* * ntfs_look_for_free_space - Look for a free space in bitmap. */ int ntfs_look_for_free_space(struct ntfs_sb_info *sbi, CLST lcn, CLST len, CLST *new_lcn, CLST *new_len, enum ALLOCATE_OPT opt) { int err; CLST alen; struct super_block *sb = sbi->sb; size_t alcn, zlen, zeroes, zlcn, zlen2, ztrim, new_zlen; struct wnd_bitmap *wnd = &sbi->used.bitmap; down_write_nested(&wnd->rw_lock, BITMAP_MUTEX_CLUSTERS); if (opt & ALLOCATE_MFT) { zlen = wnd_zone_len(wnd); if (!zlen) { err = ntfs_refresh_zone(sbi); if (err) goto up_write; zlen = wnd_zone_len(wnd); } if (!zlen) { ntfs_err(sbi->sb, "no free space to extend mft"); err = -ENOSPC; goto up_write; } lcn = wnd_zone_bit(wnd); alen = min_t(CLST, len, zlen); wnd_zone_set(wnd, lcn + alen, zlen - alen); err = wnd_set_used(wnd, lcn, alen); if (err) goto up_write; alcn = lcn; goto space_found; } /* * 'Cause cluster 0 is always used this value means that we should use * cached value of 'next_free_lcn' to improve performance. */ if (!lcn) lcn = sbi->used.next_free_lcn; if (lcn >= wnd->nbits) lcn = 0; alen = wnd_find(wnd, len, lcn, BITMAP_FIND_MARK_AS_USED, &alcn); if (alen) goto space_found; /* Try to use clusters from MftZone. */ zlen = wnd_zone_len(wnd); zeroes = wnd_zeroes(wnd); /* Check too big request */ if (len > zeroes + zlen || zlen <= NTFS_MIN_MFT_ZONE) { err = -ENOSPC; goto up_write; } /* How many clusters to cat from zone. */ zlcn = wnd_zone_bit(wnd); zlen2 = zlen >> 1; ztrim = clamp_val(len, zlen2, zlen); new_zlen = max_t(size_t, zlen - ztrim, NTFS_MIN_MFT_ZONE); wnd_zone_set(wnd, zlcn, new_zlen); /* Allocate continues clusters. */ alen = wnd_find(wnd, len, 0, BITMAP_FIND_MARK_AS_USED | BITMAP_FIND_FULL, &alcn); if (!alen) { err = -ENOSPC; goto up_write; } space_found: err = 0; *new_len = alen; *new_lcn = alcn; ntfs_unmap_meta(sb, alcn, alen); /* Set hint for next requests. */ if (!(opt & ALLOCATE_MFT)) sbi->used.next_free_lcn = alcn + alen; up_write: up_write(&wnd->rw_lock); return err; } /* * ntfs_check_for_free_space * * Check if it is possible to allocate 'clen' clusters and 'mlen' Mft records */ bool ntfs_check_for_free_space(struct ntfs_sb_info *sbi, CLST clen, CLST mlen) { size_t free, zlen, avail; struct wnd_bitmap *wnd; wnd = &sbi->used.bitmap; down_read_nested(&wnd->rw_lock, BITMAP_MUTEX_CLUSTERS); free = wnd_zeroes(wnd); zlen = min_t(size_t, NTFS_MIN_MFT_ZONE, wnd_zone_len(wnd)); up_read(&wnd->rw_lock); if (free < zlen + clen) return false; avail = free - (zlen + clen); wnd = &sbi->mft.bitmap; down_read_nested(&wnd->rw_lock, BITMAP_MUTEX_MFT); free = wnd_zeroes(wnd); zlen = wnd_zone_len(wnd); up_read(&wnd->rw_lock); if (free >= zlen + mlen) return true; return avail >= bytes_to_cluster(sbi, mlen << sbi->record_bits); } /* * ntfs_extend_mft - Allocate additional MFT records. * * sbi->mft.bitmap is locked for write. * * NOTE: recursive: * ntfs_look_free_mft -> * ntfs_extend_mft -> * attr_set_size -> * ni_insert_nonresident -> * ni_insert_attr -> * ni_ins_attr_ext -> * ntfs_look_free_mft -> * ntfs_extend_mft * * To avoid recursive always allocate space for two new MFT records * see attrib.c: "at least two MFT to avoid recursive loop". */ static int ntfs_extend_mft(struct ntfs_sb_info *sbi) { int err; struct ntfs_inode *ni = sbi->mft.ni; size_t new_mft_total; u64 new_mft_bytes, new_bitmap_bytes; struct ATTRIB *attr; struct wnd_bitmap *wnd = &sbi->mft.bitmap; new_mft_total = ALIGN(wnd->nbits + NTFS_MFT_INCREASE_STEP, 128); new_mft_bytes = (u64)new_mft_total << sbi->record_bits; /* Step 1: Resize $MFT::DATA. */ down_write(&ni->file.run_lock); err = attr_set_size(ni, ATTR_DATA, NULL, 0, &ni->file.run, new_mft_bytes, NULL, false, &attr); if (err) { up_write(&ni->file.run_lock); goto out; } attr->nres.valid_size = attr->nres.data_size; new_mft_total = le64_to_cpu(attr->nres.alloc_size) >> sbi->record_bits; ni->mi.dirty = true; /* Step 2: Resize $MFT::BITMAP. */ new_bitmap_bytes = bitmap_size(new_mft_total); err = attr_set_size(ni, ATTR_BITMAP, NULL, 0, &sbi->mft.bitmap.run, new_bitmap_bytes, &new_bitmap_bytes, true, NULL); /* Refresh MFT Zone if necessary. */ down_write_nested(&sbi->used.bitmap.rw_lock, BITMAP_MUTEX_CLUSTERS); ntfs_refresh_zone(sbi); up_write(&sbi->used.bitmap.rw_lock); up_write(&ni->file.run_lock); if (err) goto out; err = wnd_extend(wnd, new_mft_total); if (err) goto out; ntfs_clear_mft_tail(sbi, sbi->mft.used, new_mft_total); err = _ni_write_inode(&ni->vfs_inode, 0); out: return err; } /* * ntfs_look_free_mft - Look for a free MFT record. */ int ntfs_look_free_mft(struct ntfs_sb_info *sbi, CLST *rno, bool mft, struct ntfs_inode *ni, struct mft_inode **mi) { int err = 0; size_t zbit, zlen, from, to, fr; size_t mft_total; struct MFT_REF ref; struct super_block *sb = sbi->sb; struct wnd_bitmap *wnd = &sbi->mft.bitmap; u32 ir; static_assert(sizeof(sbi->mft.reserved_bitmap) * 8 >= MFT_REC_FREE - MFT_REC_RESERVED); if (!mft) down_write_nested(&wnd->rw_lock, BITMAP_MUTEX_MFT); zlen = wnd_zone_len(wnd); /* Always reserve space for MFT. */ if (zlen) { if (mft) { zbit = wnd_zone_bit(wnd); *rno = zbit; wnd_zone_set(wnd, zbit + 1, zlen - 1); } goto found; } /* No MFT zone. Find the nearest to '0' free MFT. */ if (!wnd_find(wnd, 1, MFT_REC_FREE, 0, &zbit)) { /* Resize MFT */ mft_total = wnd->nbits; err = ntfs_extend_mft(sbi); if (!err) { zbit = mft_total; goto reserve_mft; } if (!mft || MFT_REC_FREE == sbi->mft.next_reserved) goto out; err = 0; /* * Look for free record reserved area [11-16) == * [MFT_REC_RESERVED, MFT_REC_FREE ) MFT bitmap always * marks it as used. */ if (!sbi->mft.reserved_bitmap) { /* Once per session create internal bitmap for 5 bits. */ sbi->mft.reserved_bitmap = 0xFF; ref.high = 0; for (ir = MFT_REC_RESERVED; ir < MFT_REC_FREE; ir++) { struct inode *i; struct ntfs_inode *ni; struct MFT_REC *mrec; ref.low = cpu_to_le32(ir); ref.seq = cpu_to_le16(ir); i = ntfs_iget5(sb, &ref, NULL); if (IS_ERR(i)) { next: ntfs_notice( sb, "Invalid reserved record %x", ref.low); continue; } if (is_bad_inode(i)) { iput(i); goto next; } ni = ntfs_i(i); mrec = ni->mi.mrec; if (!is_rec_base(mrec)) goto next; if (mrec->hard_links) goto next; if (!ni_std(ni)) goto next; if (ni_find_attr(ni, NULL, NULL, ATTR_NAME, NULL, 0, NULL, NULL)) goto next; __clear_bit(ir - MFT_REC_RESERVED, &sbi->mft.reserved_bitmap); } } /* Scan 5 bits for zero. Bit 0 == MFT_REC_RESERVED */ zbit = find_next_zero_bit(&sbi->mft.reserved_bitmap, MFT_REC_FREE, MFT_REC_RESERVED); if (zbit >= MFT_REC_FREE) { sbi->mft.next_reserved = MFT_REC_FREE; goto out; } zlen = 1; sbi->mft.next_reserved = zbit; } else { reserve_mft: zlen = zbit == MFT_REC_FREE ? (MFT_REC_USER - MFT_REC_FREE) : 4; if (zbit + zlen > wnd->nbits) zlen = wnd->nbits - zbit; while (zlen > 1 && !wnd_is_free(wnd, zbit, zlen)) zlen -= 1; /* [zbit, zbit + zlen) will be used for MFT itself. */ from = sbi->mft.used; if (from < zbit) from = zbit; to = zbit + zlen; if (from < to) { ntfs_clear_mft_tail(sbi, from, to); sbi->mft.used = to; } } if (mft) { *rno = zbit; zbit += 1; zlen -= 1; } wnd_zone_set(wnd, zbit, zlen); found: if (!mft) { /* The request to get record for general purpose. */ if (sbi->mft.next_free < MFT_REC_USER) sbi->mft.next_free = MFT_REC_USER; for (;;) { if (sbi->mft.next_free >= sbi->mft.bitmap.nbits) { } else if (!wnd_find(wnd, 1, MFT_REC_USER, 0, &fr)) { sbi->mft.next_free = sbi->mft.bitmap.nbits; } else { *rno = fr; sbi->mft.next_free = *rno + 1; break; } err = ntfs_extend_mft(sbi); if (err) goto out; } } if (ni && !ni_add_subrecord(ni, *rno, mi)) { err = -ENOMEM; goto out; } /* We have found a record that are not reserved for next MFT. */ if (*rno >= MFT_REC_FREE) wnd_set_used(wnd, *rno, 1); else if (*rno >= MFT_REC_RESERVED && sbi->mft.reserved_bitmap_inited) __set_bit(*rno - MFT_REC_RESERVED, &sbi->mft.reserved_bitmap); out: if (!mft) up_write(&wnd->rw_lock); return err; } /* * ntfs_mark_rec_free - Mark record as free. * is_mft - true if we are changing MFT */ void ntfs_mark_rec_free(struct ntfs_sb_info *sbi, CLST rno, bool is_mft) { struct wnd_bitmap *wnd = &sbi->mft.bitmap; if (!is_mft) down_write_nested(&wnd->rw_lock, BITMAP_MUTEX_MFT); if (rno >= wnd->nbits) goto out; if (rno >= MFT_REC_FREE) { if (!wnd_is_used(wnd, rno, 1)) ntfs_set_state(sbi, NTFS_DIRTY_ERROR); else wnd_set_free(wnd, rno, 1); } else if (rno >= MFT_REC_RESERVED && sbi->mft.reserved_bitmap_inited) { __clear_bit(rno - MFT_REC_RESERVED, &sbi->mft.reserved_bitmap); } if (rno < wnd_zone_bit(wnd)) wnd_zone_set(wnd, rno, 1); else if (rno < sbi->mft.next_free && rno >= MFT_REC_USER) sbi->mft.next_free = rno; out: if (!is_mft) up_write(&wnd->rw_lock); } /* * ntfs_clear_mft_tail - Format empty records [from, to). * * sbi->mft.bitmap is locked for write. */ int ntfs_clear_mft_tail(struct ntfs_sb_info *sbi, size_t from, size_t to) { int err; u32 rs; u64 vbo; struct runs_tree *run; struct ntfs_inode *ni; if (from >= to) return 0; rs = sbi->record_size; ni = sbi->mft.ni; run = &ni->file.run; down_read(&ni->file.run_lock); vbo = (u64)from * rs; for (; from < to; from++, vbo += rs) { struct ntfs_buffers nb; err = ntfs_get_bh(sbi, run, vbo, rs, &nb); if (err) goto out; err = ntfs_write_bh(sbi, &sbi->new_rec->rhdr, &nb, 0); nb_put(&nb); if (err) goto out; } out: sbi->mft.used = from; up_read(&ni->file.run_lock); return err; } /* * ntfs_refresh_zone - Refresh MFT zone. * * sbi->used.bitmap is locked for rw. * sbi->mft.bitmap is locked for write. * sbi->mft.ni->file.run_lock for write. */ int ntfs_refresh_zone(struct ntfs_sb_info *sbi) { CLST lcn, vcn, len; size_t lcn_s, zlen; struct wnd_bitmap *wnd = &sbi->used.bitmap; struct ntfs_inode *ni = sbi->mft.ni; /* Do not change anything unless we have non empty MFT zone. */ if (wnd_zone_len(wnd)) return 0; vcn = bytes_to_cluster(sbi, (u64)sbi->mft.bitmap.nbits << sbi->record_bits); if (!run_lookup_entry(&ni->file.run, vcn - 1, &lcn, &len, NULL)) lcn = SPARSE_LCN; /* We should always find Last Lcn for MFT. */ if (lcn == SPARSE_LCN) return -EINVAL; lcn_s = lcn + 1; /* Try to allocate clusters after last MFT run. */ zlen = wnd_find(wnd, sbi->zone_max, lcn_s, 0, &lcn_s); wnd_zone_set(wnd, lcn_s, zlen); return 0; } /* * ntfs_update_mftmirr - Update $MFTMirr data. */ void ntfs_update_mftmirr(struct ntfs_sb_info *sbi, int wait) { int err; struct super_block *sb = sbi->sb; u32 blocksize, bytes; sector_t block1, block2; /* * sb can be NULL here. In this case sbi->flags should be 0 too. */ if (!sb || !(sbi->flags & NTFS_FLAGS_MFTMIRR)) return; blocksize = sb->s_blocksize; bytes = sbi->mft.recs_mirr << sbi->record_bits; block1 = sbi->mft.lbo >> sb->s_blocksize_bits; block2 = sbi->mft.lbo2 >> sb->s_blocksize_bits; for (; bytes >= blocksize; bytes -= blocksize) { struct buffer_head *bh1, *bh2; bh1 = sb_bread(sb, block1++); if (!bh1) return; bh2 = sb_getblk(sb, block2++); if (!bh2) { put_bh(bh1); return; } if (buffer_locked(bh2)) __wait_on_buffer(bh2); lock_buffer(bh2); memcpy(bh2->b_data, bh1->b_data, blocksize); set_buffer_uptodate(bh2); mark_buffer_dirty(bh2); unlock_buffer(bh2); put_bh(bh1); bh1 = NULL; err = wait ? sync_dirty_buffer(bh2) : 0; put_bh(bh2); if (err) return; } sbi->flags &= ~NTFS_FLAGS_MFTMIRR; } /* * ntfs_bad_inode * * Marks inode as bad and marks fs as 'dirty' */ void ntfs_bad_inode(struct inode *inode, const char *hint) { struct ntfs_sb_info *sbi = inode->i_sb->s_fs_info; ntfs_inode_err(inode, "%s", hint); make_bad_inode(inode); ntfs_set_state(sbi, NTFS_DIRTY_ERROR); } /* * ntfs_set_state * * Mount: ntfs_set_state(NTFS_DIRTY_DIRTY) * Umount: ntfs_set_state(NTFS_DIRTY_CLEAR) * NTFS error: ntfs_set_state(NTFS_DIRTY_ERROR) */ int ntfs_set_state(struct ntfs_sb_info *sbi, enum NTFS_DIRTY_FLAGS dirty) { int err; struct ATTRIB *attr; struct VOLUME_INFO *info; struct mft_inode *mi; struct ntfs_inode *ni; __le16 info_flags; /* * Do not change state if fs was real_dirty. * Do not change state if fs already dirty(clear). * Do not change any thing if mounted read only. */ if (sbi->volume.real_dirty || sb_rdonly(sbi->sb)) return 0; /* Check cached value. */ if ((dirty == NTFS_DIRTY_CLEAR ? 0 : VOLUME_FLAG_DIRTY) == (sbi->volume.flags & VOLUME_FLAG_DIRTY)) return 0; ni = sbi->volume.ni; if (!ni) return -EINVAL; mutex_lock_nested(&ni->ni_lock, NTFS_INODE_MUTEX_DIRTY); attr = ni_find_attr(ni, NULL, NULL, ATTR_VOL_INFO, NULL, 0, NULL, &mi); if (!attr) { err = -EINVAL; goto out; } info = resident_data_ex(attr, SIZEOF_ATTRIBUTE_VOLUME_INFO); if (!info) { err = -EINVAL; goto out; } info_flags = info->flags; switch (dirty) { case NTFS_DIRTY_ERROR: ntfs_notice(sbi->sb, "Mark volume as dirty due to NTFS errors"); sbi->volume.real_dirty = true; fallthrough; case NTFS_DIRTY_DIRTY: info->flags |= VOLUME_FLAG_DIRTY; break; case NTFS_DIRTY_CLEAR: info->flags &= ~VOLUME_FLAG_DIRTY; break; } /* Cache current volume flags. */ if (info_flags != info->flags) { sbi->volume.flags = info->flags; mi->dirty = true; } err = 0; out: ni_unlock(ni); if (err) return err; mark_inode_dirty_sync(&ni->vfs_inode); /* verify(!ntfs_update_mftmirr()); */ /* write mft record on disk. */ err = _ni_write_inode(&ni->vfs_inode, 1); return err; } /* * security_hash - Calculates a hash of security descriptor. */ static inline __le32 security_hash(const void *sd, size_t bytes) { u32 hash = 0; const __le32 *ptr = sd; bytes >>= 2; while (bytes--) hash = ((hash >> 0x1D) | (hash << 3)) + le32_to_cpu(*ptr++); return cpu_to_le32(hash); } int ntfs_sb_read(struct super_block *sb, u64 lbo, size_t bytes, void *buffer) { struct block_device *bdev = sb->s_bdev; u32 blocksize = sb->s_blocksize; u64 block = lbo >> sb->s_blocksize_bits; u32 off = lbo & (blocksize - 1); u32 op = blocksize - off; for (; bytes; block += 1, off = 0, op = blocksize) { struct buffer_head *bh = __bread(bdev, block, blocksize); if (!bh) return -EIO; if (op > bytes) op = bytes; memcpy(buffer, bh->b_data + off, op); put_bh(bh); bytes -= op; buffer = Add2Ptr(buffer, op); } return 0; } int ntfs_sb_write(struct super_block *sb, u64 lbo, size_t bytes, const void *buf, int wait) { u32 blocksize = sb->s_blocksize; struct block_device *bdev = sb->s_bdev; sector_t block = lbo >> sb->s_blocksize_bits; u32 off = lbo & (blocksize - 1); u32 op = blocksize - off; struct buffer_head *bh; if (!wait && (sb->s_flags & SB_SYNCHRONOUS)) wait = 1; for (; bytes; block += 1, off = 0, op = blocksize) { if (op > bytes) op = bytes; if (op < blocksize) { bh = __bread(bdev, block, blocksize); if (!bh) { ntfs_err(sb, "failed to read block %llx", (u64)block); return -EIO; } } else { bh = __getblk(bdev, block, blocksize); if (!bh) return -ENOMEM; } if (buffer_locked(bh)) __wait_on_buffer(bh); lock_buffer(bh); if (buf) { memcpy(bh->b_data + off, buf, op); buf = Add2Ptr(buf, op); } else { memset(bh->b_data + off, -1, op); } set_buffer_uptodate(bh); mark_buffer_dirty(bh); unlock_buffer(bh); if (wait) { int err = sync_dirty_buffer(bh); if (err) { ntfs_err( sb, "failed to sync buffer at block %llx, error %d", (u64)block, err); put_bh(bh); return err; } } put_bh(bh); bytes -= op; } return 0; } int ntfs_sb_write_run(struct ntfs_sb_info *sbi, const struct runs_tree *run, u64 vbo, const void *buf, size_t bytes, int sync) { struct super_block *sb = sbi->sb; u8 cluster_bits = sbi->cluster_bits; u32 off = vbo & sbi->cluster_mask; CLST lcn, clen, vcn = vbo >> cluster_bits, vcn_next; u64 lbo, len; size_t idx; if (!run_lookup_entry(run, vcn, &lcn, &clen, &idx)) return -ENOENT; if (lcn == SPARSE_LCN) return -EINVAL; lbo = ((u64)lcn << cluster_bits) + off; len = ((u64)clen << cluster_bits) - off; for (;;) { u32 op = min_t(u64, len, bytes); int err = ntfs_sb_write(sb, lbo, op, buf, sync); if (err) return err; bytes -= op; if (!bytes) break; vcn_next = vcn + clen; if (!run_get_entry(run, ++idx, &vcn, &lcn, &clen) || vcn != vcn_next) return -ENOENT; if (lcn == SPARSE_LCN) return -EINVAL; if (buf) buf = Add2Ptr(buf, op); lbo = ((u64)lcn << cluster_bits); len = ((u64)clen << cluster_bits); } return 0; } struct buffer_head *ntfs_bread_run(struct ntfs_sb_info *sbi, const struct runs_tree *run, u64 vbo) { struct super_block *sb = sbi->sb; u8 cluster_bits = sbi->cluster_bits; CLST lcn; u64 lbo; if (!run_lookup_entry(run, vbo >> cluster_bits, &lcn, NULL, NULL)) return ERR_PTR(-ENOENT); lbo = ((u64)lcn << cluster_bits) + (vbo & sbi->cluster_mask); return ntfs_bread(sb, lbo >> sb->s_blocksize_bits); } int ntfs_read_run_nb(struct ntfs_sb_info *sbi, const struct runs_tree *run, u64 vbo, void *buf, u32 bytes, struct ntfs_buffers *nb) { int err; struct super_block *sb = sbi->sb; u32 blocksize = sb->s_blocksize; u8 cluster_bits = sbi->cluster_bits; u32 off = vbo & sbi->cluster_mask; u32 nbh = 0; CLST vcn_next, vcn = vbo >> cluster_bits; CLST lcn, clen; u64 lbo, len; size_t idx; struct buffer_head *bh; if (!run) { /* First reading of $Volume + $MFTMirr + $LogFile goes here. */ if (vbo > MFT_REC_VOL * sbi->record_size) { err = -ENOENT; goto out; } /* Use absolute boot's 'MFTCluster' to read record. */ lbo = vbo + sbi->mft.lbo; len = sbi->record_size; } else if (!run_lookup_entry(run, vcn, &lcn, &clen, &idx)) { err = -ENOENT; goto out; } else { if (lcn == SPARSE_LCN) { err = -EINVAL; goto out; } lbo = ((u64)lcn << cluster_bits) + off; len = ((u64)clen << cluster_bits) - off; } off = lbo & (blocksize - 1); if (nb) { nb->off = off; nb->bytes = bytes; } for (;;) { u32 len32 = len >= bytes ? bytes : len; sector_t block = lbo >> sb->s_blocksize_bits; do { u32 op = blocksize - off; if (op > len32) op = len32; bh = ntfs_bread(sb, block); if (!bh) { err = -EIO; goto out; } if (buf) { memcpy(buf, bh->b_data + off, op); buf = Add2Ptr(buf, op); } if (!nb) { put_bh(bh); } else if (nbh >= ARRAY_SIZE(nb->bh)) { err = -EINVAL; goto out; } else { nb->bh[nbh++] = bh; nb->nbufs = nbh; } bytes -= op; if (!bytes) return 0; len32 -= op; block += 1; off = 0; } while (len32); vcn_next = vcn + clen; if (!run_get_entry(run, ++idx, &vcn, &lcn, &clen) || vcn != vcn_next) { err = -ENOENT; goto out; } if (lcn == SPARSE_LCN) { err = -EINVAL; goto out; } lbo = ((u64)lcn << cluster_bits); len = ((u64)clen << cluster_bits); } out: if (!nbh) return err; while (nbh) { put_bh(nb->bh[--nbh]); nb->bh[nbh] = NULL; } nb->nbufs = 0; return err; } /* * ntfs_read_bh * * Return: < 0 if error, 0 if ok, -E_NTFS_FIXUP if need to update fixups. */ int ntfs_read_bh(struct ntfs_sb_info *sbi, const struct runs_tree *run, u64 vbo, struct NTFS_RECORD_HEADER *rhdr, u32 bytes, struct ntfs_buffers *nb) { int err = ntfs_read_run_nb(sbi, run, vbo, rhdr, bytes, nb); if (err) return err; return ntfs_fix_post_read(rhdr, nb->bytes, true); } int ntfs_get_bh(struct ntfs_sb_info *sbi, const struct runs_tree *run, u64 vbo, u32 bytes, struct ntfs_buffers *nb) { int err = 0; struct super_block *sb = sbi->sb; u32 blocksize = sb->s_blocksize; u8 cluster_bits = sbi->cluster_bits; CLST vcn_next, vcn = vbo >> cluster_bits; u32 off; u32 nbh = 0; CLST lcn, clen; u64 lbo, len; size_t idx; nb->bytes = bytes; if (!run_lookup_entry(run, vcn, &lcn, &clen, &idx)) { err = -ENOENT; goto out; } off = vbo & sbi->cluster_mask; lbo = ((u64)lcn << cluster_bits) + off; len = ((u64)clen << cluster_bits) - off; nb->off = off = lbo & (blocksize - 1); for (;;) { u32 len32 = min_t(u64, len, bytes); sector_t block = lbo >> sb->s_blocksize_bits; do { u32 op; struct buffer_head *bh; if (nbh >= ARRAY_SIZE(nb->bh)) { err = -EINVAL; goto out; } op = blocksize - off; if (op > len32) op = len32; if (op == blocksize) { bh = sb_getblk(sb, block); if (!bh) { err = -ENOMEM; goto out; } if (buffer_locked(bh)) __wait_on_buffer(bh); set_buffer_uptodate(bh); } else { bh = ntfs_bread(sb, block); if (!bh) { err = -EIO; goto out; } } nb->bh[nbh++] = bh; bytes -= op; if (!bytes) { nb->nbufs = nbh; return 0; } block += 1; len32 -= op; off = 0; } while (len32); vcn_next = vcn + clen; if (!run_get_entry(run, ++idx, &vcn, &lcn, &clen) || vcn != vcn_next) { err = -ENOENT; goto out; } lbo = ((u64)lcn << cluster_bits); len = ((u64)clen << cluster_bits); } out: while (nbh) { put_bh(nb->bh[--nbh]); nb->bh[nbh] = NULL; } nb->nbufs = 0; return err; } int ntfs_write_bh(struct ntfs_sb_info *sbi, struct NTFS_RECORD_HEADER *rhdr, struct ntfs_buffers *nb, int sync) { int err = 0; struct super_block *sb = sbi->sb; u32 block_size = sb->s_blocksize; u32 bytes = nb->bytes; u32 off = nb->off; u16 fo = le16_to_cpu(rhdr->fix_off); u16 fn = le16_to_cpu(rhdr->fix_num); u32 idx; __le16 *fixup; __le16 sample; if ((fo & 1) || fo + fn * sizeof(short) > SECTOR_SIZE || !fn-- || fn * SECTOR_SIZE > bytes) { return -EINVAL; } for (idx = 0; bytes && idx < nb->nbufs; idx += 1, off = 0) { u32 op = block_size - off; char *bh_data; struct buffer_head *bh = nb->bh[idx]; __le16 *ptr, *end_data; if (op > bytes) op = bytes; if (buffer_locked(bh)) __wait_on_buffer(bh); lock_buffer(bh); bh_data = bh->b_data + off; end_data = Add2Ptr(bh_data, op); memcpy(bh_data, rhdr, op); if (!idx) { u16 t16; fixup = Add2Ptr(bh_data, fo); sample = *fixup; t16 = le16_to_cpu(sample); if (t16 >= 0x7FFF) { sample = *fixup = cpu_to_le16(1); } else { sample = cpu_to_le16(t16 + 1); *fixup = sample; } *(__le16 *)Add2Ptr(rhdr, fo) = sample; } ptr = Add2Ptr(bh_data, SECTOR_SIZE - sizeof(short)); do { *++fixup = *ptr; *ptr = sample; ptr += SECTOR_SIZE / sizeof(short); } while (ptr < end_data); set_buffer_uptodate(bh); mark_buffer_dirty(bh); unlock_buffer(bh); if (sync) { int err2 = sync_dirty_buffer(bh); if (!err && err2) err = err2; } bytes -= op; rhdr = Add2Ptr(rhdr, op); } return err; } /* * ntfs_bio_pages - Read/write pages from/to disk. */ int ntfs_bio_pages(struct ntfs_sb_info *sbi, const struct runs_tree *run, struct page **pages, u32 nr_pages, u64 vbo, u32 bytes, enum req_op op) { int err = 0; struct bio *new, *bio = NULL; struct super_block *sb = sbi->sb; struct block_device *bdev = sb->s_bdev; struct page *page; u8 cluster_bits = sbi->cluster_bits; CLST lcn, clen, vcn, vcn_next; u32 add, off, page_idx; u64 lbo, len; size_t run_idx; struct blk_plug plug; if (!bytes) return 0; blk_start_plug(&plug); /* Align vbo and bytes to be 512 bytes aligned. */ lbo = (vbo + bytes + 511) & ~511ull; vbo = vbo & ~511ull; bytes = lbo - vbo; vcn = vbo >> cluster_bits; if (!run_lookup_entry(run, vcn, &lcn, &clen, &run_idx)) { err = -ENOENT; goto out; } off = vbo & sbi->cluster_mask; page_idx = 0; page = pages[0]; for (;;) { lbo = ((u64)lcn << cluster_bits) + off; len = ((u64)clen << cluster_bits) - off; new_bio: new = bio_alloc(bdev, nr_pages - page_idx, op, GFP_NOFS); if (bio) { bio_chain(bio, new); submit_bio(bio); } bio = new; bio->bi_iter.bi_sector = lbo >> 9; while (len) { off = vbo & (PAGE_SIZE - 1); add = off + len > PAGE_SIZE ? (PAGE_SIZE - off) : len; if (bio_add_page(bio, page, add, off) < add) goto new_bio; if (bytes <= add) goto out; bytes -= add; vbo += add; if (add + off == PAGE_SIZE) { page_idx += 1; if (WARN_ON(page_idx >= nr_pages)) { err = -EINVAL; goto out; } page = pages[page_idx]; } if (len <= add) break; len -= add; lbo += add; } vcn_next = vcn + clen; if (!run_get_entry(run, ++run_idx, &vcn, &lcn, &clen) || vcn != vcn_next) { err = -ENOENT; goto out; } off = 0; } out: if (bio) { if (!err) err = submit_bio_wait(bio); bio_put(bio); } blk_finish_plug(&plug); return err; } /* * ntfs_bio_fill_1 - Helper for ntfs_loadlog_and_replay(). * * Fill on-disk logfile range by (-1) * this means empty logfile. */ int ntfs_bio_fill_1(struct ntfs_sb_info *sbi, const struct runs_tree *run) { int err = 0; struct super_block *sb = sbi->sb; struct block_device *bdev = sb->s_bdev; u8 cluster_bits = sbi->cluster_bits; struct bio *new, *bio = NULL; CLST lcn, clen; u64 lbo, len; size_t run_idx; struct page *fill; void *kaddr; struct blk_plug plug; fill = alloc_page(GFP_KERNEL); if (!fill) return -ENOMEM; kaddr = kmap_atomic(fill); memset(kaddr, -1, PAGE_SIZE); kunmap_atomic(kaddr); flush_dcache_page(fill); lock_page(fill); if (!run_lookup_entry(run, 0, &lcn, &clen, &run_idx)) { err = -ENOENT; goto out; } /* * TODO: Try blkdev_issue_write_same. */ blk_start_plug(&plug); do { lbo = (u64)lcn << cluster_bits; len = (u64)clen << cluster_bits; new_bio: new = bio_alloc(bdev, BIO_MAX_VECS, REQ_OP_WRITE, GFP_NOFS); if (bio) { bio_chain(bio, new); submit_bio(bio); } bio = new; bio->bi_iter.bi_sector = lbo >> 9; for (;;) { u32 add = len > PAGE_SIZE ? PAGE_SIZE : len; if (bio_add_page(bio, fill, add, 0) < add) goto new_bio; lbo += add; if (len <= add) break; len -= add; } } while (run_get_entry(run, ++run_idx, NULL, &lcn, &clen)); if (!err) err = submit_bio_wait(bio); bio_put(bio); blk_finish_plug(&plug); out: unlock_page(fill); put_page(fill); return err; } int ntfs_vbo_to_lbo(struct ntfs_sb_info *sbi, const struct runs_tree *run, u64 vbo, u64 *lbo, u64 *bytes) { u32 off; CLST lcn, len; u8 cluster_bits = sbi->cluster_bits; if (!run_lookup_entry(run, vbo >> cluster_bits, &lcn, &len, NULL)) return -ENOENT; off = vbo & sbi->cluster_mask; *lbo = lcn == SPARSE_LCN ? -1 : (((u64)lcn << cluster_bits) + off); *bytes = ((u64)len << cluster_bits) - off; return 0; } struct ntfs_inode *ntfs_new_inode(struct ntfs_sb_info *sbi, CLST rno, enum RECORD_FLAG flag) { int err = 0; struct super_block *sb = sbi->sb; struct inode *inode = new_inode(sb); struct ntfs_inode *ni; if (!inode) return ERR_PTR(-ENOMEM); ni = ntfs_i(inode); err = mi_format_new(&ni->mi, sbi, rno, flag, false); if (err) goto out; inode->i_ino = rno; if (insert_inode_locked(inode) < 0) { err = -EIO; goto out; } out: if (err) { make_bad_inode(inode); iput(inode); ni = ERR_PTR(err); } return ni; } /* * O:BAG:BAD:(A;OICI;FA;;;WD) * Owner S-1-5-32-544 (Administrators) * Group S-1-5-32-544 (Administrators) * ACE: allow S-1-1-0 (Everyone) with FILE_ALL_ACCESS */ const u8 s_default_security[] __aligned(8) = { 0x01, 0x00, 0x04, 0x80, 0x30, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x02, 0x00, 0x1C, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x03, 0x14, 0x00, 0xFF, 0x01, 0x1F, 0x00, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x20, 0x00, 0x00, 0x00, 0x20, 0x02, 0x00, 0x00, 0x01, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x20, 0x00, 0x00, 0x00, 0x20, 0x02, 0x00, 0x00, }; static_assert(sizeof(s_default_security) == 0x50); static inline u32 sid_length(const struct SID *sid) { return struct_size(sid, SubAuthority, sid->SubAuthorityCount); } /* * is_acl_valid * * Thanks Mark Harmstone for idea. */ static bool is_acl_valid(const struct ACL *acl, u32 len) { const struct ACE_HEADER *ace; u32 i; u16 ace_count, ace_size; if (acl->AclRevision != ACL_REVISION && acl->AclRevision != ACL_REVISION_DS) { /* * This value should be ACL_REVISION, unless the ACL contains an * object-specific ACE, in which case this value must be ACL_REVISION_DS. * All ACEs in an ACL must be at the same revision level. */ return false; } if (acl->Sbz1) return false; if (le16_to_cpu(acl->AclSize) > len) return false; if (acl->Sbz2) return false; len -= sizeof(struct ACL); ace = (struct ACE_HEADER *)&acl[1]; ace_count = le16_to_cpu(acl->AceCount); for (i = 0; i < ace_count; i++) { if (len < sizeof(struct ACE_HEADER)) return false; ace_size = le16_to_cpu(ace->AceSize); if (len < ace_size) return false; len -= ace_size; ace = Add2Ptr(ace, ace_size); } return true; } bool is_sd_valid(const struct SECURITY_DESCRIPTOR_RELATIVE *sd, u32 len) { u32 sd_owner, sd_group, sd_sacl, sd_dacl; if (len < sizeof(struct SECURITY_DESCRIPTOR_RELATIVE)) return false; if (sd->Revision != 1) return false; if (sd->Sbz1) return false; if (!(sd->Control & SE_SELF_RELATIVE)) return false; sd_owner = le32_to_cpu(sd->Owner); if (sd_owner) { const struct SID *owner = Add2Ptr(sd, sd_owner); if (sd_owner + offsetof(struct SID, SubAuthority) > len) return false; if (owner->Revision != 1) return false; if (sd_owner + sid_length(owner) > len) return false; } sd_group = le32_to_cpu(sd->Group); if (sd_group) { const struct SID *group = Add2Ptr(sd, sd_group); if (sd_group + offsetof(struct SID, SubAuthority) > len) return false; if (group->Revision != 1) return false; if (sd_group + sid_length(group) > len) return false; } sd_sacl = le32_to_cpu(sd->Sacl); if (sd_sacl) { const struct ACL *sacl = Add2Ptr(sd, sd_sacl); if (sd_sacl + sizeof(struct ACL) > len) return false; if (!is_acl_valid(sacl, len - sd_sacl)) return false; } sd_dacl = le32_to_cpu(sd->Dacl); if (sd_dacl) { const struct ACL *dacl = Add2Ptr(sd, sd_dacl); if (sd_dacl + sizeof(struct ACL) > len) return false; if (!is_acl_valid(dacl, len - sd_dacl)) return false; } return true; } /* * ntfs_security_init - Load and parse $Secure. */ int ntfs_security_init(struct ntfs_sb_info *sbi) { int err; struct super_block *sb = sbi->sb; struct inode *inode; struct ntfs_inode *ni; struct MFT_REF ref; struct ATTRIB *attr; struct ATTR_LIST_ENTRY *le; u64 sds_size; size_t off; struct NTFS_DE *ne; struct NTFS_DE_SII *sii_e; struct ntfs_fnd *fnd_sii = NULL; const struct INDEX_ROOT *root_sii; const struct INDEX_ROOT *root_sdh; struct ntfs_index *indx_sdh = &sbi->security.index_sdh; struct ntfs_index *indx_sii = &sbi->security.index_sii; ref.low = cpu_to_le32(MFT_REC_SECURE); ref.high = 0; ref.seq = cpu_to_le16(MFT_REC_SECURE); inode = ntfs_iget5(sb, &ref, &NAME_SECURE); if (IS_ERR(inode)) { err = PTR_ERR(inode); ntfs_err(sb, "Failed to load $Secure (%d).", err); inode = NULL; goto out; } ni = ntfs_i(inode); le = NULL; attr = ni_find_attr(ni, NULL, &le, ATTR_ROOT, SDH_NAME, ARRAY_SIZE(SDH_NAME), NULL, NULL); if (!attr || !(root_sdh = resident_data_ex(attr, sizeof(struct INDEX_ROOT))) || root_sdh->type != ATTR_ZERO || root_sdh->rule != NTFS_COLLATION_TYPE_SECURITY_HASH || offsetof(struct INDEX_ROOT, ihdr) + le32_to_cpu(root_sdh->ihdr.used) > le32_to_cpu(attr->res.data_size)) { ntfs_err(sb, "$Secure::$SDH is corrupted."); err = -EINVAL; goto out; } err = indx_init(indx_sdh, sbi, attr, INDEX_MUTEX_SDH); if (err) { ntfs_err(sb, "Failed to initialize $Secure::$SDH (%d).", err); goto out; } attr = ni_find_attr(ni, attr, &le, ATTR_ROOT, SII_NAME, ARRAY_SIZE(SII_NAME), NULL, NULL); if (!attr || !(root_sii = resident_data_ex(attr, sizeof(struct INDEX_ROOT))) || root_sii->type != ATTR_ZERO || root_sii->rule != NTFS_COLLATION_TYPE_UINT || offsetof(struct INDEX_ROOT, ihdr) + le32_to_cpu(root_sii->ihdr.used) > le32_to_cpu(attr->res.data_size)) { ntfs_err(sb, "$Secure::$SII is corrupted."); err = -EINVAL; goto out; } err = indx_init(indx_sii, sbi, attr, INDEX_MUTEX_SII); if (err) { ntfs_err(sb, "Failed to initialize $Secure::$SII (%d).", err); goto out; } fnd_sii = fnd_get(); if (!fnd_sii) { err = -ENOMEM; goto out; } sds_size = inode->i_size; /* Find the last valid Id. */ sbi->security.next_id = SECURITY_ID_FIRST; /* Always write new security at the end of bucket. */ sbi->security.next_off = ALIGN(sds_size - SecurityDescriptorsBlockSize, 16); off = 0; ne = NULL; for (;;) { u32 next_id; err = indx_find_raw(indx_sii, ni, root_sii, &ne, &off, fnd_sii); if (err || !ne) break; sii_e = (struct NTFS_DE_SII *)ne; if (le16_to_cpu(ne->view.data_size) < sizeof(sii_e->sec_hdr)) continue; next_id = le32_to_cpu(sii_e->sec_id) + 1; if (next_id >= sbi->security.next_id) sbi->security.next_id = next_id; } sbi->security.ni = ni; inode = NULL; out: iput(inode); fnd_put(fnd_sii); return err; } /* * ntfs_get_security_by_id - Read security descriptor by id. */ int ntfs_get_security_by_id(struct ntfs_sb_info *sbi, __le32 security_id, struct SECURITY_DESCRIPTOR_RELATIVE **sd, size_t *size) { int err; int diff; struct ntfs_inode *ni = sbi->security.ni; struct ntfs_index *indx = &sbi->security.index_sii; void *p = NULL; struct NTFS_DE_SII *sii_e; struct ntfs_fnd *fnd_sii; struct SECURITY_HDR d_security; const struct INDEX_ROOT *root_sii; u32 t32; *sd = NULL; mutex_lock_nested(&ni->ni_lock, NTFS_INODE_MUTEX_SECURITY); fnd_sii = fnd_get(); if (!fnd_sii) { err = -ENOMEM; goto out; } root_sii = indx_get_root(indx, ni, NULL, NULL); if (!root_sii) { err = -EINVAL; goto out; } /* Try to find this SECURITY descriptor in SII indexes. */ err = indx_find(indx, ni, root_sii, &security_id, sizeof(security_id), NULL, &diff, (struct NTFS_DE **)&sii_e, fnd_sii); if (err) goto out; if (diff) goto out; t32 = le32_to_cpu(sii_e->sec_hdr.size); if (t32 < sizeof(struct SECURITY_HDR)) { err = -EINVAL; goto out; } if (t32 > sizeof(struct SECURITY_HDR) + 0x10000) { /* Looks like too big security. 0x10000 - is arbitrary big number. */ err = -EFBIG; goto out; } *size = t32 - sizeof(struct SECURITY_HDR); p = kmalloc(*size, GFP_NOFS); if (!p) { err = -ENOMEM; goto out; } err = ntfs_read_run_nb(sbi, &ni->file.run, le64_to_cpu(sii_e->sec_hdr.off), &d_security, sizeof(d_security), NULL); if (err) goto out; if (memcmp(&d_security, &sii_e->sec_hdr, sizeof(d_security))) { err = -EINVAL; goto out; } err = ntfs_read_run_nb(sbi, &ni->file.run, le64_to_cpu(sii_e->sec_hdr.off) + sizeof(struct SECURITY_HDR), p, *size, NULL); if (err) goto out; *sd = p; p = NULL; out: kfree(p); fnd_put(fnd_sii); ni_unlock(ni); return err; } /* * ntfs_insert_security - Insert security descriptor into $Secure::SDS. * * SECURITY Descriptor Stream data is organized into chunks of 256K bytes * and it contains a mirror copy of each security descriptor. When writing * to a security descriptor at location X, another copy will be written at * location (X+256K). * When writing a security descriptor that will cross the 256K boundary, * the pointer will be advanced by 256K to skip * over the mirror portion. */ int ntfs_insert_security(struct ntfs_sb_info *sbi, const struct SECURITY_DESCRIPTOR_RELATIVE *sd, u32 size_sd, __le32 *security_id, bool *inserted) { int err, diff; struct ntfs_inode *ni = sbi->security.ni; struct ntfs_index *indx_sdh = &sbi->security.index_sdh; struct ntfs_index *indx_sii = &sbi->security.index_sii; struct NTFS_DE_SDH *e; struct NTFS_DE_SDH sdh_e; struct NTFS_DE_SII sii_e; struct SECURITY_HDR *d_security; u32 new_sec_size = size_sd + sizeof(struct SECURITY_HDR); u32 aligned_sec_size = ALIGN(new_sec_size, 16); struct SECURITY_KEY hash_key; struct ntfs_fnd *fnd_sdh = NULL; const struct INDEX_ROOT *root_sdh; const struct INDEX_ROOT *root_sii; u64 mirr_off, new_sds_size; u32 next, left; static_assert((1 << Log2OfSecurityDescriptorsBlockSize) == SecurityDescriptorsBlockSize); hash_key.hash = security_hash(sd, size_sd); hash_key.sec_id = SECURITY_ID_INVALID; if (inserted) *inserted = false; *security_id = SECURITY_ID_INVALID; /* Allocate a temporal buffer. */ d_security = kzalloc(aligned_sec_size, GFP_NOFS); if (!d_security) return -ENOMEM; mutex_lock_nested(&ni->ni_lock, NTFS_INODE_MUTEX_SECURITY); fnd_sdh = fnd_get(); if (!fnd_sdh) { err = -ENOMEM; goto out; } root_sdh = indx_get_root(indx_sdh, ni, NULL, NULL); if (!root_sdh) { err = -EINVAL; goto out; } root_sii = indx_get_root(indx_sii, ni, NULL, NULL); if (!root_sii) { err = -EINVAL; goto out; } /* * Check if such security already exists. * Use "SDH" and hash -> to get the offset in "SDS". */ err = indx_find(indx_sdh, ni, root_sdh, &hash_key, sizeof(hash_key), &d_security->key.sec_id, &diff, (struct NTFS_DE **)&e, fnd_sdh); if (err) goto out; while (e) { if (le32_to_cpu(e->sec_hdr.size) == new_sec_size) { err = ntfs_read_run_nb(sbi, &ni->file.run, le64_to_cpu(e->sec_hdr.off), d_security, new_sec_size, NULL); if (err) goto out; if (le32_to_cpu(d_security->size) == new_sec_size && d_security->key.hash == hash_key.hash && !memcmp(d_security + 1, sd, size_sd)) { *security_id = d_security->key.sec_id; /* Such security already exists. */ err = 0; goto out; } } err = indx_find_sort(indx_sdh, ni, root_sdh, (struct NTFS_DE **)&e, fnd_sdh); if (err) goto out; if (!e || e->key.hash != hash_key.hash) break; } /* Zero unused space. */ next = sbi->security.next_off & (SecurityDescriptorsBlockSize - 1); left = SecurityDescriptorsBlockSize - next; /* Zero gap until SecurityDescriptorsBlockSize. */ if (left < new_sec_size) { /* Zero "left" bytes from sbi->security.next_off. */ sbi->security.next_off += SecurityDescriptorsBlockSize + left; } /* Zero tail of previous security. */ //used = ni->vfs_inode.i_size & (SecurityDescriptorsBlockSize - 1); /* * Example: * 0x40438 == ni->vfs_inode.i_size * 0x00440 == sbi->security.next_off * need to zero [0x438-0x440) * if (next > used) { * u32 tozero = next - used; * zero "tozero" bytes from sbi->security.next_off - tozero */ /* Format new security descriptor. */ d_security->key.hash = hash_key.hash; d_security->key.sec_id = cpu_to_le32(sbi->security.next_id); d_security->off = cpu_to_le64(sbi->security.next_off); d_security->size = cpu_to_le32(new_sec_size); memcpy(d_security + 1, sd, size_sd); /* Write main SDS bucket. */ err = ntfs_sb_write_run(sbi, &ni->file.run, sbi->security.next_off, d_security, aligned_sec_size, 0); if (err) goto out; mirr_off = sbi->security.next_off + SecurityDescriptorsBlockSize; new_sds_size = mirr_off + aligned_sec_size; if (new_sds_size > ni->vfs_inode.i_size) { err = attr_set_size(ni, ATTR_DATA, SDS_NAME, ARRAY_SIZE(SDS_NAME), &ni->file.run, new_sds_size, &new_sds_size, false, NULL); if (err) goto out; } /* Write copy SDS bucket. */ err = ntfs_sb_write_run(sbi, &ni->file.run, mirr_off, d_security, aligned_sec_size, 0); if (err) goto out; /* Fill SII entry. */ sii_e.de.view.data_off = cpu_to_le16(offsetof(struct NTFS_DE_SII, sec_hdr)); sii_e.de.view.data_size = cpu_to_le16(sizeof(struct SECURITY_HDR)); sii_e.de.view.res = 0; sii_e.de.size = cpu_to_le16(sizeof(struct NTFS_DE_SII)); sii_e.de.key_size = cpu_to_le16(sizeof(d_security->key.sec_id)); sii_e.de.flags = 0; sii_e.de.res = 0; sii_e.sec_id = d_security->key.sec_id; memcpy(&sii_e.sec_hdr, d_security, sizeof(struct SECURITY_HDR)); err = indx_insert_entry(indx_sii, ni, &sii_e.de, NULL, NULL, 0); if (err) goto out; /* Fill SDH entry. */ sdh_e.de.view.data_off = cpu_to_le16(offsetof(struct NTFS_DE_SDH, sec_hdr)); sdh_e.de.view.data_size = cpu_to_le16(sizeof(struct SECURITY_HDR)); sdh_e.de.view.res = 0; sdh_e.de.size = cpu_to_le16(SIZEOF_SDH_DIRENTRY); sdh_e.de.key_size = cpu_to_le16(sizeof(sdh_e.key)); sdh_e.de.flags = 0; sdh_e.de.res = 0; sdh_e.key.hash = d_security->key.hash; sdh_e.key.sec_id = d_security->key.sec_id; memcpy(&sdh_e.sec_hdr, d_security, sizeof(struct SECURITY_HDR)); sdh_e.magic[0] = cpu_to_le16('I'); sdh_e.magic[1] = cpu_to_le16('I'); fnd_clear(fnd_sdh); err = indx_insert_entry(indx_sdh, ni, &sdh_e.de, (void *)(size_t)1, fnd_sdh, 0); if (err) goto out; *security_id = d_security->key.sec_id; if (inserted) *inserted = true; /* Update Id and offset for next descriptor. */ sbi->security.next_id += 1; sbi->security.next_off += aligned_sec_size; out: fnd_put(fnd_sdh); mark_inode_dirty(&ni->vfs_inode); ni_unlock(ni); kfree(d_security); return err; } /* * ntfs_reparse_init - Load and parse $Extend/$Reparse. */ int ntfs_reparse_init(struct ntfs_sb_info *sbi) { int err; struct ntfs_inode *ni = sbi->reparse.ni; struct ntfs_index *indx = &sbi->reparse.index_r; struct ATTRIB *attr; struct ATTR_LIST_ENTRY *le; const struct INDEX_ROOT *root_r; if (!ni) return 0; le = NULL; attr = ni_find_attr(ni, NULL, &le, ATTR_ROOT, SR_NAME, ARRAY_SIZE(SR_NAME), NULL, NULL); if (!attr) { err = -EINVAL; goto out; } root_r = resident_data(attr); if (root_r->type != ATTR_ZERO || root_r->rule != NTFS_COLLATION_TYPE_UINTS) { err = -EINVAL; goto out; } err = indx_init(indx, sbi, attr, INDEX_MUTEX_SR); if (err) goto out; out: return err; } /* * ntfs_objid_init - Load and parse $Extend/$ObjId. */ int ntfs_objid_init(struct ntfs_sb_info *sbi) { int err; struct ntfs_inode *ni = sbi->objid.ni; struct ntfs_index *indx = &sbi->objid.index_o; struct ATTRIB *attr; struct ATTR_LIST_ENTRY *le; const struct INDEX_ROOT *root; if (!ni) return 0; le = NULL; attr = ni_find_attr(ni, NULL, &le, ATTR_ROOT, SO_NAME, ARRAY_SIZE(SO_NAME), NULL, NULL); if (!attr) { err = -EINVAL; goto out; } root = resident_data(attr); if (root->type != ATTR_ZERO || root->rule != NTFS_COLLATION_TYPE_UINTS) { err = -EINVAL; goto out; } err = indx_init(indx, sbi, attr, INDEX_MUTEX_SO); if (err) goto out; out: return err; } int ntfs_objid_remove(struct ntfs_sb_info *sbi, struct GUID *guid) { int err; struct ntfs_inode *ni = sbi->objid.ni; struct ntfs_index *indx = &sbi->objid.index_o; if (!ni) return -EINVAL; mutex_lock_nested(&ni->ni_lock, NTFS_INODE_MUTEX_OBJID); err = indx_delete_entry(indx, ni, guid, sizeof(*guid), NULL); mark_inode_dirty(&ni->vfs_inode); ni_unlock(ni); return err; } int ntfs_insert_reparse(struct ntfs_sb_info *sbi, __le32 rtag, const struct MFT_REF *ref) { int err; struct ntfs_inode *ni = sbi->reparse.ni; struct ntfs_index *indx = &sbi->reparse.index_r; struct NTFS_DE_R re; if (!ni) return -EINVAL; memset(&re, 0, sizeof(re)); re.de.view.data_off = cpu_to_le16(offsetof(struct NTFS_DE_R, zero)); re.de.size = cpu_to_le16(sizeof(struct NTFS_DE_R)); re.de.key_size = cpu_to_le16(sizeof(re.key)); re.key.ReparseTag = rtag; memcpy(&re.key.ref, ref, sizeof(*ref)); mutex_lock_nested(&ni->ni_lock, NTFS_INODE_MUTEX_REPARSE); err = indx_insert_entry(indx, ni, &re.de, NULL, NULL, 0); mark_inode_dirty(&ni->vfs_inode); ni_unlock(ni); return err; } int ntfs_remove_reparse(struct ntfs_sb_info *sbi, __le32 rtag, const struct MFT_REF *ref) { int err, diff; struct ntfs_inode *ni = sbi->reparse.ni; struct ntfs_index *indx = &sbi->reparse.index_r; struct ntfs_fnd *fnd = NULL; struct REPARSE_KEY rkey; struct NTFS_DE_R *re; struct INDEX_ROOT *root_r; if (!ni) return -EINVAL; rkey.ReparseTag = rtag; rkey.ref = *ref; mutex_lock_nested(&ni->ni_lock, NTFS_INODE_MUTEX_REPARSE); if (rtag) { err = indx_delete_entry(indx, ni, &rkey, sizeof(rkey), NULL); goto out1; } fnd = fnd_get(); if (!fnd) { err = -ENOMEM; goto out1; } root_r = indx_get_root(indx, ni, NULL, NULL); if (!root_r) { err = -EINVAL; goto out; } /* 1 - forces to ignore rkey.ReparseTag when comparing keys. */ err = indx_find(indx, ni, root_r, &rkey, sizeof(rkey), (void *)1, &diff, (struct NTFS_DE **)&re, fnd); if (err) goto out; if (memcmp(&re->key.ref, ref, sizeof(*ref))) { /* Impossible. Looks like volume corrupt? */ goto out; } memcpy(&rkey, &re->key, sizeof(rkey)); fnd_put(fnd); fnd = NULL; err = indx_delete_entry(indx, ni, &rkey, sizeof(rkey), NULL); if (err) goto out; out: fnd_put(fnd); out1: mark_inode_dirty(&ni->vfs_inode); ni_unlock(ni); return err; } static inline void ntfs_unmap_and_discard(struct ntfs_sb_info *sbi, CLST lcn, CLST len) { ntfs_unmap_meta(sbi->sb, lcn, len); ntfs_discard(sbi, lcn, len); } void mark_as_free_ex(struct ntfs_sb_info *sbi, CLST lcn, CLST len, bool trim) { CLST end, i, zone_len, zlen; struct wnd_bitmap *wnd = &sbi->used.bitmap; bool dirty = false; down_write_nested(&wnd->rw_lock, BITMAP_MUTEX_CLUSTERS); if (!wnd_is_used(wnd, lcn, len)) { /* mark volume as dirty out of wnd->rw_lock */ dirty = true; end = lcn + len; len = 0; for (i = lcn; i < end; i++) { if (wnd_is_used(wnd, i, 1)) { if (!len) lcn = i; len += 1; continue; } if (!len) continue; if (trim) ntfs_unmap_and_discard(sbi, lcn, len); wnd_set_free(wnd, lcn, len); len = 0; } if (!len) goto out; } if (trim) ntfs_unmap_and_discard(sbi, lcn, len); wnd_set_free(wnd, lcn, len); /* append to MFT zone, if possible. */ zone_len = wnd_zone_len(wnd); zlen = min(zone_len + len, sbi->zone_max); if (zlen == zone_len) { /* MFT zone already has maximum size. */ } else if (!zone_len) { /* Create MFT zone only if 'zlen' is large enough. */ if (zlen == sbi->zone_max) wnd_zone_set(wnd, lcn, zlen); } else { CLST zone_lcn = wnd_zone_bit(wnd); if (lcn + len == zone_lcn) { /* Append into head MFT zone. */ wnd_zone_set(wnd, lcn, zlen); } else if (zone_lcn + zone_len == lcn) { /* Append into tail MFT zone. */ wnd_zone_set(wnd, zone_lcn, zlen); } } out: up_write(&wnd->rw_lock); if (dirty) ntfs_set_state(sbi, NTFS_DIRTY_ERROR); } /* * run_deallocate - Deallocate clusters. */ int run_deallocate(struct ntfs_sb_info *sbi, const struct runs_tree *run, bool trim) { CLST lcn, len; size_t idx = 0; while (run_get_entry(run, idx++, NULL, &lcn, &len)) { if (lcn == SPARSE_LCN) continue; mark_as_free_ex(sbi, lcn, len, trim); } return 0; } static inline bool name_has_forbidden_chars(const struct le_str *fname) { int i, ch; /* check for forbidden chars */ for (i = 0; i < fname->len; ++i) { ch = le16_to_cpu(fname->name[i]); /* control chars */ if (ch < 0x20) return true; switch (ch) { /* disallowed by Windows */ case '\\': case '/': case ':': case '*': case '?': case '<': case '>': case '|': case '\"': return true; default: /* allowed char */ break; } } /* file names cannot end with space or . */ if (fname->len > 0) { ch = le16_to_cpu(fname->name[fname->len - 1]); if (ch == ' ' || ch == '.') return true; } return false; } static inline bool is_reserved_name(const struct ntfs_sb_info *sbi, const struct le_str *fname) { int port_digit; const __le16 *name = fname->name; int len = fname->len; const u16 *upcase = sbi->upcase; /* check for 3 chars reserved names (device names) */ /* name by itself or with any extension is forbidden */ if (len == 3 || (len > 3 && le16_to_cpu(name[3]) == '.')) if (!ntfs_cmp_names(name, 3, CON_NAME, 3, upcase, false) || !ntfs_cmp_names(name, 3, NUL_NAME, 3, upcase, false) || !ntfs_cmp_names(name, 3, AUX_NAME, 3, upcase, false) || !ntfs_cmp_names(name, 3, PRN_NAME, 3, upcase, false)) return true; /* check for 4 chars reserved names (port name followed by 1..9) */ /* name by itself or with any extension is forbidden */ if (len == 4 || (len > 4 && le16_to_cpu(name[4]) == '.')) { port_digit = le16_to_cpu(name[3]); if (port_digit >= '1' && port_digit <= '9') if (!ntfs_cmp_names(name, 3, COM_NAME, 3, upcase, false) || !ntfs_cmp_names(name, 3, LPT_NAME, 3, upcase, false)) return true; } return false; } /* * valid_windows_name - Check if a file name is valid in Windows. */ bool valid_windows_name(struct ntfs_sb_info *sbi, const struct le_str *fname) { return !name_has_forbidden_chars(fname) && !is_reserved_name(sbi, fname); } /* * ntfs_set_label - updates current ntfs label. */ int ntfs_set_label(struct ntfs_sb_info *sbi, u8 *label, int len) { int err; struct ATTRIB *attr; struct ntfs_inode *ni = sbi->volume.ni; const u8 max_ulen = 0x80; /* TODO: use attrdef to get maximum length */ /* Allocate PATH_MAX bytes. */ struct cpu_str *uni = __getname(); if (!uni) return -ENOMEM; err = ntfs_nls_to_utf16(sbi, label, len, uni, (PATH_MAX - 2) / 2, UTF16_LITTLE_ENDIAN); if (err < 0) goto out; if (uni->len > max_ulen) { ntfs_warn(sbi->sb, "new label is too long"); err = -EFBIG; goto out; } ni_lock(ni); /* Ignore any errors. */ ni_remove_attr(ni, ATTR_LABEL, NULL, 0, false, NULL); err = ni_insert_resident(ni, uni->len * sizeof(u16), ATTR_LABEL, NULL, 0, &attr, NULL, NULL); if (err < 0) goto unlock_out; /* write new label in on-disk struct. */ memcpy(resident_data(attr), uni->name, uni->len * sizeof(u16)); /* update cached value of current label. */ if (len >= ARRAY_SIZE(sbi->volume.label)) len = ARRAY_SIZE(sbi->volume.label) - 1; memcpy(sbi->volume.label, label, len); sbi->volume.label[len] = 0; mark_inode_dirty_sync(&ni->vfs_inode); unlock_out: ni_unlock(ni); if (!err) err = _ni_write_inode(&ni->vfs_inode, 0); out: __putname(uni); return err; }
71 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 // SPDX-License-Identifier: GPL-2.0-or-later /* * Information interface for ALSA driver * Copyright (c) by Jaroslav Kysela <perex@perex.cz> */ #include <linux/slab.h> #include <linux/time.h> #include <linux/string.h> #include <linux/export.h> #include <sound/core.h> #include <sound/minors.h> #include <sound/info.h> #include <linux/utsname.h> #include <linux/mutex.h> /* * OSS compatible part */ static DEFINE_MUTEX(strings); static char *snd_sndstat_strings[SNDRV_CARDS][SNDRV_OSS_INFO_DEV_COUNT]; int snd_oss_info_register(int dev, int num, char *string) { char *x; if (snd_BUG_ON(dev < 0 || dev >= SNDRV_OSS_INFO_DEV_COUNT)) return -ENXIO; if (snd_BUG_ON(num < 0 || num >= SNDRV_CARDS)) return -ENXIO; mutex_lock(&strings); if (string == NULL) { x = snd_sndstat_strings[num][dev]; kfree(x); x = NULL; } else { x = kstrdup(string, GFP_KERNEL); if (x == NULL) { mutex_unlock(&strings); return -ENOMEM; } } snd_sndstat_strings[num][dev] = x; mutex_unlock(&strings); return 0; } EXPORT_SYMBOL(snd_oss_info_register); static int snd_sndstat_show_strings(struct snd_info_buffer *buf, char *id, int dev) { int idx, ok = -1; char *str; snd_iprintf(buf, "\n%s:", id); mutex_lock(&strings); for (idx = 0; idx < SNDRV_CARDS; idx++) { str = snd_sndstat_strings[idx][dev]; if (str) { if (ok < 0) { snd_iprintf(buf, "\n"); ok++; } snd_iprintf(buf, "%i: %s\n", idx, str); } } mutex_unlock(&strings); if (ok < 0) snd_iprintf(buf, " NOT ENABLED IN CONFIG\n"); return ok; } static void snd_sndstat_proc_read(struct snd_info_entry *entry, struct snd_info_buffer *buffer) { snd_iprintf(buffer, "Sound Driver:3.8.1a-980706 (ALSA emulation code)\n"); snd_iprintf(buffer, "Kernel: %s %s %s %s %s\n", init_utsname()->sysname, init_utsname()->nodename, init_utsname()->release, init_utsname()->version, init_utsname()->machine); snd_iprintf(buffer, "Config options: 0\n"); snd_iprintf(buffer, "\nInstalled drivers: \n"); snd_iprintf(buffer, "Type 10: ALSA emulation\n"); snd_iprintf(buffer, "\nCard config: \n"); snd_card_info_read_oss(buffer); snd_sndstat_show_strings(buffer, "Audio devices", SNDRV_OSS_INFO_DEV_AUDIO); snd_sndstat_show_strings(buffer, "Synth devices", SNDRV_OSS_INFO_DEV_SYNTH); snd_sndstat_show_strings(buffer, "Midi devices", SNDRV_OSS_INFO_DEV_MIDI); snd_sndstat_show_strings(buffer, "Timers", SNDRV_OSS_INFO_DEV_TIMERS); snd_sndstat_show_strings(buffer, "Mixers", SNDRV_OSS_INFO_DEV_MIXERS); } int __init snd_info_minor_register(void) { struct snd_info_entry *entry; memset(snd_sndstat_strings, 0, sizeof(snd_sndstat_strings)); entry = snd_info_create_module_entry(THIS_MODULE, "sndstat", snd_oss_root); if (!entry) return -ENOMEM; entry->c.text.read = snd_sndstat_proc_read; return snd_info_register(entry); /* freed in error path */ }
1 22 22 21 1 98 73 71 17 1 2 23 23 17 17 24 17 17 57 21 22 20 4 26 35 18 23 18 33 17 59 26 18 1 1 1 25 25 24 1 2 23 23 21 21 1 1 2 1 1 1 22 23 23 23 21 1 21 21 21 21 21 21 21 21 21 21 21 21 21 22 23 25 21 22 25 23 2 1 27 1 25 1 2 22 98 77 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 // SPDX-License-Identifier: GPL-2.0-only /* * Kernel-based Virtual Machine driver for Linux * cpuid support routines * * derived from arch/x86/kvm/x86.c * * Copyright 2011 Red Hat, Inc. and/or its affiliates. * Copyright IBM Corporation, 2008 */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_host.h> #include "linux/lockdep.h" #include <linux/export.h> #include <linux/vmalloc.h> #include <linux/uaccess.h> #include <linux/sched/stat.h> #include <asm/processor.h> #include <asm/user.h> #include <asm/fpu/xstate.h> #include <asm/sgx.h> #include <asm/cpuid.h> #include "cpuid.h" #include "lapic.h" #include "mmu.h" #include "trace.h" #include "pmu.h" #include "xen.h" /* * Unlike "struct cpuinfo_x86.x86_capability", kvm_cpu_caps doesn't need to be * aligned to sizeof(unsigned long) because it's not accessed via bitops. */ u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly; EXPORT_SYMBOL_GPL(kvm_cpu_caps); u32 xstate_required_size(u64 xstate_bv, bool compacted) { int feature_bit = 0; u32 ret = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; xstate_bv &= XFEATURE_MASK_EXTEND; while (xstate_bv) { if (xstate_bv & 0x1) { u32 eax, ebx, ecx, edx, offset; cpuid_count(0xD, feature_bit, &eax, &ebx, &ecx, &edx); /* ECX[1]: 64B alignment in compacted form */ if (compacted) offset = (ecx & 0x2) ? ALIGN(ret, 64) : ret; else offset = ebx; ret = max(ret, offset + eax); } xstate_bv >>= 1; feature_bit++; } return ret; } #define F feature_bit /* Scattered Flag - For features that are scattered by cpufeatures.h. */ #define SF(name) \ ({ \ BUILD_BUG_ON(X86_FEATURE_##name >= MAX_CPU_FEATURES); \ (boot_cpu_has(X86_FEATURE_##name) ? F(name) : 0); \ }) /* * Magic value used by KVM when querying userspace-provided CPUID entries and * doesn't care about the CPIUD index because the index of the function in * question is not significant. Note, this magic value must have at least one * bit set in bits[63:32] and must be consumed as a u64 by cpuid_entry2_find() * to avoid false positives when processing guest CPUID input. */ #define KVM_CPUID_INDEX_NOT_SIGNIFICANT -1ull static inline struct kvm_cpuid_entry2 *cpuid_entry2_find( struct kvm_cpuid_entry2 *entries, int nent, u32 function, u64 index) { struct kvm_cpuid_entry2 *e; int i; /* * KVM has a semi-arbitrary rule that querying the guest's CPUID model * with IRQs disabled is disallowed. The CPUID model can legitimately * have over one hundred entries, i.e. the lookup is slow, and IRQs are * typically disabled in KVM only when KVM is in a performance critical * path, e.g. the core VM-Enter/VM-Exit run loop. Nothing will break * if this rule is violated, this assertion is purely to flag potential * performance issues. If this fires, consider moving the lookup out * of the hotpath, e.g. by caching information during CPUID updates. */ lockdep_assert_irqs_enabled(); for (i = 0; i < nent; i++) { e = &entries[i]; if (e->function != function) continue; /* * If the index isn't significant, use the first entry with a * matching function. It's userspace's responsibility to not * provide "duplicate" entries in all cases. */ if (!(e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) || e->index == index) return e; /* * Similarly, use the first matching entry if KVM is doing a * lookup (as opposed to emulating CPUID) for a function that's * architecturally defined as not having a significant index. */ if (index == KVM_CPUID_INDEX_NOT_SIGNIFICANT) { /* * Direct lookups from KVM should not diverge from what * KVM defines internally (the architectural behavior). */ WARN_ON_ONCE(cpuid_function_is_indexed(function)); return e; } } return NULL; } static int kvm_check_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *entries, int nent) { struct kvm_cpuid_entry2 *best; u64 xfeatures; /* * The existing code assumes virtual address is 48-bit or 57-bit in the * canonical address checks; exit if it is ever changed. */ best = cpuid_entry2_find(entries, nent, 0x80000008, KVM_CPUID_INDEX_NOT_SIGNIFICANT); if (best) { int vaddr_bits = (best->eax & 0xff00) >> 8; if (vaddr_bits != 48 && vaddr_bits != 57 && vaddr_bits != 0) return -EINVAL; } /* * Exposing dynamic xfeatures to the guest requires additional * enabling in the FPU, e.g. to expand the guest XSAVE state size. */ best = cpuid_entry2_find(entries, nent, 0xd, 0); if (!best) return 0; xfeatures = best->eax | ((u64)best->edx << 32); xfeatures &= XFEATURE_MASK_USER_DYNAMIC; if (!xfeatures) return 0; return fpu_enable_guest_xfd_features(&vcpu->arch.guest_fpu, xfeatures); } /* Check whether the supplied CPUID data is equal to what is already set for the vCPU. */ static int kvm_cpuid_check_equal(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2, int nent) { struct kvm_cpuid_entry2 *orig; int i; if (nent != vcpu->arch.cpuid_nent) return -EINVAL; for (i = 0; i < nent; i++) { orig = &vcpu->arch.cpuid_entries[i]; if (e2[i].function != orig->function || e2[i].index != orig->index || e2[i].flags != orig->flags || e2[i].eax != orig->eax || e2[i].ebx != orig->ebx || e2[i].ecx != orig->ecx || e2[i].edx != orig->edx) return -EINVAL; } return 0; } static struct kvm_hypervisor_cpuid kvm_get_hypervisor_cpuid(struct kvm_vcpu *vcpu, const char *sig) { struct kvm_hypervisor_cpuid cpuid = {}; struct kvm_cpuid_entry2 *entry; u32 base; for_each_possible_hypervisor_cpuid_base(base) { entry = kvm_find_cpuid_entry(vcpu, base); if (entry) { u32 signature[3]; signature[0] = entry->ebx; signature[1] = entry->ecx; signature[2] = entry->edx; if (!memcmp(signature, sig, sizeof(signature))) { cpuid.base = base; cpuid.limit = entry->eax; break; } } } return cpuid; } static struct kvm_cpuid_entry2 *__kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *entries, int nent) { u32 base = vcpu->arch.kvm_cpuid.base; if (!base) return NULL; return cpuid_entry2_find(entries, nent, base | KVM_CPUID_FEATURES, KVM_CPUID_INDEX_NOT_SIGNIFICANT); } static struct kvm_cpuid_entry2 *kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu) { return __kvm_find_kvm_cpuid_features(vcpu, vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent); } void kvm_update_pv_runtime(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best = kvm_find_kvm_cpuid_features(vcpu); /* * save the feature bitmap to avoid cpuid lookup for every PV * operation */ if (best) vcpu->arch.pv_cpuid.features = best->eax; } /* * Calculate guest's supported XCR0 taking into account guest CPUID data and * KVM's supported XCR0 (comprised of host's XCR0 and KVM_SUPPORTED_XCR0). */ static u64 cpuid_get_supported_xcr0(struct kvm_cpuid_entry2 *entries, int nent) { struct kvm_cpuid_entry2 *best; best = cpuid_entry2_find(entries, nent, 0xd, 0); if (!best) return 0; return (best->eax | ((u64)best->edx << 32)) & kvm_caps.supported_xcr0; } static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *entries, int nent) { struct kvm_cpuid_entry2 *best; best = cpuid_entry2_find(entries, nent, 1, KVM_CPUID_INDEX_NOT_SIGNIFICANT); if (best) { /* Update OSXSAVE bit */ if (boot_cpu_has(X86_FEATURE_XSAVE)) cpuid_entry_change(best, X86_FEATURE_OSXSAVE, kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)); cpuid_entry_change(best, X86_FEATURE_APIC, vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE); } best = cpuid_entry2_find(entries, nent, 7, 0); if (best && boot_cpu_has(X86_FEATURE_PKU) && best->function == 0x7) cpuid_entry_change(best, X86_FEATURE_OSPKE, kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE)); best = cpuid_entry2_find(entries, nent, 0xD, 0); if (best) best->ebx = xstate_required_size(vcpu->arch.xcr0, false); best = cpuid_entry2_find(entries, nent, 0xD, 1); if (best && (cpuid_entry_has(best, X86_FEATURE_XSAVES) || cpuid_entry_has(best, X86_FEATURE_XSAVEC))) best->ebx = xstate_required_size(vcpu->arch.xcr0, true); best = __kvm_find_kvm_cpuid_features(vcpu, entries, nent); if (kvm_hlt_in_guest(vcpu->kvm) && best && (best->eax & (1 << KVM_FEATURE_PV_UNHALT))) best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT); if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) { best = cpuid_entry2_find(entries, nent, 0x1, KVM_CPUID_INDEX_NOT_SIGNIFICANT); if (best) cpuid_entry_change(best, X86_FEATURE_MWAIT, vcpu->arch.ia32_misc_enable_msr & MSR_IA32_MISC_ENABLE_MWAIT); } } void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu) { __kvm_update_cpuid_runtime(vcpu, vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent); } EXPORT_SYMBOL_GPL(kvm_update_cpuid_runtime); static bool kvm_cpuid_has_hyperv(struct kvm_cpuid_entry2 *entries, int nent) { struct kvm_cpuid_entry2 *entry; entry = cpuid_entry2_find(entries, nent, HYPERV_CPUID_INTERFACE, KVM_CPUID_INDEX_NOT_SIGNIFICANT); return entry && entry->eax == HYPERV_CPUID_SIGNATURE_EAX; } static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; struct kvm_cpuid_entry2 *best; bool allow_gbpages; BUILD_BUG_ON(KVM_NR_GOVERNED_FEATURES > KVM_MAX_NR_GOVERNED_FEATURES); bitmap_zero(vcpu->arch.governed_features.enabled, KVM_MAX_NR_GOVERNED_FEATURES); /* * If TDP is enabled, let the guest use GBPAGES if they're supported in * hardware. The hardware page walker doesn't let KVM disable GBPAGES, * i.e. won't treat them as reserved, and KVM doesn't redo the GVA->GPA * walk for performance and complexity reasons. Not to mention KVM * _can't_ solve the problem because GVA->GPA walks aren't visible to * KVM once a TDP translation is installed. Mimic hardware behavior so * that KVM's is at least consistent, i.e. doesn't randomly inject #PF. * If TDP is disabled, honor *only* guest CPUID as KVM has full control * and can install smaller shadow pages if the host lacks 1GiB support. */ allow_gbpages = tdp_enabled ? boot_cpu_has(X86_FEATURE_GBPAGES) : guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES); if (allow_gbpages) kvm_governed_feature_set(vcpu, X86_FEATURE_GBPAGES); best = kvm_find_cpuid_entry(vcpu, 1); if (best && apic) { if (cpuid_entry_has(best, X86_FEATURE_TSC_DEADLINE_TIMER)) apic->lapic_timer.timer_mode_mask = 3 << 17; else apic->lapic_timer.timer_mode_mask = 1 << 17; kvm_apic_set_version(vcpu); } vcpu->arch.guest_supported_xcr0 = cpuid_get_supported_xcr0(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent); kvm_update_pv_runtime(vcpu); vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu); kvm_pmu_refresh(vcpu); vcpu->arch.cr4_guest_rsvd_bits = __cr4_reserved_bits(guest_cpuid_has, vcpu); kvm_hv_set_cpuid(vcpu, kvm_cpuid_has_hyperv(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent)); /* Invoke the vendor callback only after the above state is updated. */ static_call(kvm_x86_vcpu_after_set_cpuid)(vcpu); /* * Except for the MMU, which needs to do its thing any vendor specific * adjustments to the reserved GPA bits. */ kvm_mmu_after_set_cpuid(vcpu); } int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; best = kvm_find_cpuid_entry(vcpu, 0x80000000); if (!best || best->eax < 0x80000008) goto not_found; best = kvm_find_cpuid_entry(vcpu, 0x80000008); if (best) return best->eax & 0xff; not_found: return 36; } /* * This "raw" version returns the reserved GPA bits without any adjustments for * encryption technologies that usurp bits. The raw mask should be used if and * only if hardware does _not_ strip the usurped bits, e.g. in virtual MTRRs. */ u64 kvm_vcpu_reserved_gpa_bits_raw(struct kvm_vcpu *vcpu) { return rsvd_bits(cpuid_maxphyaddr(vcpu), 63); } static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2, int nent) { int r; __kvm_update_cpuid_runtime(vcpu, e2, nent); /* * KVM does not correctly handle changing guest CPUID after KVM_RUN, as * MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't * tracked in kvm_mmu_page_role. As a result, KVM may miss guest page * faults due to reusing SPs/SPTEs. In practice no sane VMM mucks with * the core vCPU model on the fly. It would've been better to forbid any * KVM_SET_CPUID{,2} calls after KVM_RUN altogether but unfortunately * some VMMs (e.g. QEMU) reuse vCPU fds for CPU hotplug/unplug and do * KVM_SET_CPUID{,2} again. To support this legacy behavior, check * whether the supplied CPUID data is equal to what's already set. */ if (kvm_vcpu_has_run(vcpu)) { r = kvm_cpuid_check_equal(vcpu, e2, nent); if (r) return r; kvfree(e2); return 0; } if (kvm_cpuid_has_hyperv(e2, nent)) { r = kvm_hv_vcpu_init(vcpu); if (r) return r; } r = kvm_check_cpuid(vcpu, e2, nent); if (r) return r; kvfree(vcpu->arch.cpuid_entries); vcpu->arch.cpuid_entries = e2; vcpu->arch.cpuid_nent = nent; vcpu->arch.kvm_cpuid = kvm_get_hypervisor_cpuid(vcpu, KVM_SIGNATURE); #ifdef CONFIG_KVM_XEN vcpu->arch.xen.cpuid = kvm_get_hypervisor_cpuid(vcpu, XEN_SIGNATURE); #endif kvm_vcpu_after_set_cpuid(vcpu); return 0; } /* when an old userspace process fills a new kernel module */ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid *cpuid, struct kvm_cpuid_entry __user *entries) { int r, i; struct kvm_cpuid_entry *e = NULL; struct kvm_cpuid_entry2 *e2 = NULL; if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) return -E2BIG; if (cpuid->nent) { e = vmemdup_user(entries, array_size(sizeof(*e), cpuid->nent)); if (IS_ERR(e)) return PTR_ERR(e); e2 = kvmalloc_array(cpuid->nent, sizeof(*e2), GFP_KERNEL_ACCOUNT); if (!e2) { r = -ENOMEM; goto out_free_cpuid; } } for (i = 0; i < cpuid->nent; i++) { e2[i].function = e[i].function; e2[i].eax = e[i].eax; e2[i].ebx = e[i].ebx; e2[i].ecx = e[i].ecx; e2[i].edx = e[i].edx; e2[i].index = 0; e2[i].flags = 0; e2[i].padding[0] = 0; e2[i].padding[1] = 0; e2[i].padding[2] = 0; } r = kvm_set_cpuid(vcpu, e2, cpuid->nent); if (r) kvfree(e2); out_free_cpuid: kvfree(e); return r; } int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries) { struct kvm_cpuid_entry2 *e2 = NULL; int r; if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) return -E2BIG; if (cpuid->nent) { e2 = vmemdup_user(entries, array_size(sizeof(*e2), cpuid->nent)); if (IS_ERR(e2)) return PTR_ERR(e2); } r = kvm_set_cpuid(vcpu, e2, cpuid->nent); if (r) kvfree(e2); return r; } int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries) { if (cpuid->nent < vcpu->arch.cpuid_nent) return -E2BIG; if (copy_to_user(entries, vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2))) return -EFAULT; cpuid->nent = vcpu->arch.cpuid_nent; return 0; } /* Mask kvm_cpu_caps for @leaf with the raw CPUID capabilities of this CPU. */ static __always_inline void __kvm_cpu_cap_mask(unsigned int leaf) { const struct cpuid_reg cpuid = x86_feature_cpuid(leaf * 32); struct kvm_cpuid_entry2 entry; reverse_cpuid_check(leaf); cpuid_count(cpuid.function, cpuid.index, &entry.eax, &entry.ebx, &entry.ecx, &entry.edx); kvm_cpu_caps[leaf] &= *__cpuid_entry_get_reg(&entry, cpuid.reg); } static __always_inline void kvm_cpu_cap_init_kvm_defined(enum kvm_only_cpuid_leafs leaf, u32 mask) { /* Use kvm_cpu_cap_mask for leafs that aren't KVM-only. */ BUILD_BUG_ON(leaf < NCAPINTS); kvm_cpu_caps[leaf] = mask; __kvm_cpu_cap_mask(leaf); } static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask) { /* Use kvm_cpu_cap_init_kvm_defined for KVM-only leafs. */ BUILD_BUG_ON(leaf >= NCAPINTS); kvm_cpu_caps[leaf] &= mask; __kvm_cpu_cap_mask(leaf); } void kvm_set_cpu_caps(void) { #ifdef CONFIG_X86_64 unsigned int f_gbpages = F(GBPAGES); unsigned int f_lm = F(LM); unsigned int f_xfd = F(XFD); #else unsigned int f_gbpages = 0; unsigned int f_lm = 0; unsigned int f_xfd = 0; #endif memset(kvm_cpu_caps, 0, sizeof(kvm_cpu_caps)); BUILD_BUG_ON(sizeof(kvm_cpu_caps) - (NKVMCAPINTS * sizeof(*kvm_cpu_caps)) > sizeof(boot_cpu_data.x86_capability)); memcpy(&kvm_cpu_caps, &boot_cpu_data.x86_capability, sizeof(kvm_cpu_caps) - (NKVMCAPINTS * sizeof(*kvm_cpu_caps))); kvm_cpu_cap_mask(CPUID_1_ECX, /* * NOTE: MONITOR (and MWAIT) are emulated as NOP, but *not* * advertised to guests via CPUID! */ F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ | 0 /* DS-CPL, VMX, SMX, EST */ | 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | F(FMA) | F(CX16) | 0 /* xTPR Update */ | F(PDCM) | F(PCID) | 0 /* Reserved, DCA */ | F(XMM4_1) | F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) | F(F16C) | F(RDRAND) ); /* KVM emulates x2apic in software irrespective of host support. */ kvm_cpu_cap_set(X86_FEATURE_X2APIC); kvm_cpu_cap_mask(CPUID_1_EDX, F(FPU) | F(VME) | F(DE) | F(PSE) | F(TSC) | F(MSR) | F(PAE) | F(MCE) | F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) | F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLUSH) | 0 /* Reserved, DS, ACPI */ | F(MMX) | F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) | 0 /* HTT, TM, Reserved, PBE */ ); kvm_cpu_cap_mask(CPUID_7_0_EBX, F(FSGSBASE) | F(SGX) | F(BMI1) | F(HLE) | F(AVX2) | F(FDP_EXCPTN_ONLY) | F(SMEP) | F(BMI2) | F(ERMS) | F(INVPCID) | F(RTM) | F(ZERO_FCS_FDS) | 0 /*MPX*/ | F(AVX512F) | F(AVX512DQ) | F(RDSEED) | F(ADX) | F(SMAP) | F(AVX512IFMA) | F(CLFLUSHOPT) | F(CLWB) | 0 /*INTEL_PT*/ | F(AVX512PF) | F(AVX512ER) | F(AVX512CD) | F(SHA_NI) | F(AVX512BW) | F(AVX512VL)); kvm_cpu_cap_mask(CPUID_7_ECX, F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | F(RDPID) | F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) | F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) | F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/ | F(SGX_LC) | F(BUS_LOCK_DETECT) ); /* Set LA57 based on hardware capability. */ if (cpuid_ecx(7) & F(LA57)) kvm_cpu_cap_set(X86_FEATURE_LA57); /* * PKU not yet implemented for shadow paging and requires OSPKE * to be set on the host. Clear it if that is not the case */ if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE)) kvm_cpu_cap_clear(X86_FEATURE_PKU); kvm_cpu_cap_mask(CPUID_7_EDX, F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) | F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) | F(MD_CLEAR) | F(AVX512_VP2INTERSECT) | F(FSRM) | F(SERIALIZE) | F(TSXLDTRK) | F(AVX512_FP16) | F(AMX_TILE) | F(AMX_INT8) | F(AMX_BF16) | F(FLUSH_L1D) ); /* TSC_ADJUST and ARCH_CAPABILITIES are emulated in software. */ kvm_cpu_cap_set(X86_FEATURE_TSC_ADJUST); kvm_cpu_cap_set(X86_FEATURE_ARCH_CAPABILITIES); if (boot_cpu_has(X86_FEATURE_IBPB) && boot_cpu_has(X86_FEATURE_IBRS)) kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL); if (boot_cpu_has(X86_FEATURE_STIBP)) kvm_cpu_cap_set(X86_FEATURE_INTEL_STIBP); if (boot_cpu_has(X86_FEATURE_AMD_SSBD)) kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL_SSBD); kvm_cpu_cap_mask(CPUID_7_1_EAX, F(AVX_VNNI) | F(AVX512_BF16) | F(CMPCCXADD) | F(FZRM) | F(FSRS) | F(FSRC) | F(AMX_FP16) | F(AVX_IFMA) ); kvm_cpu_cap_init_kvm_defined(CPUID_7_1_EDX, F(AVX_VNNI_INT8) | F(AVX_NE_CONVERT) | F(PREFETCHITI) | F(AMX_COMPLEX) ); kvm_cpu_cap_mask(CPUID_D_1_EAX, F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | F(XSAVES) | f_xfd ); kvm_cpu_cap_init_kvm_defined(CPUID_12_EAX, SF(SGX1) | SF(SGX2) | SF(SGX_EDECCSSA) ); kvm_cpu_cap_mask(CPUID_8000_0001_ECX, F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ | F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) | 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM) | F(TOPOEXT) | 0 /* PERFCTR_CORE */ ); kvm_cpu_cap_mask(CPUID_8000_0001_EDX, F(FPU) | F(VME) | F(DE) | F(PSE) | F(TSC) | F(MSR) | F(PAE) | F(MCE) | F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) | F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | F(PAT) | F(PSE36) | 0 /* Reserved */ | F(NX) | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | F(FXSR) | F(FXSR_OPT) | f_gbpages | F(RDTSCP) | 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW) ); if (!tdp_enabled && IS_ENABLED(CONFIG_X86_64)) kvm_cpu_cap_set(X86_FEATURE_GBPAGES); kvm_cpu_cap_init_kvm_defined(CPUID_8000_0007_EDX, SF(CONSTANT_TSC) ); kvm_cpu_cap_mask(CPUID_8000_0008_EBX, F(CLZERO) | F(XSAVEERPTR) | F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) | F(AMD_SSB_NO) | F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON) | F(AMD_PSFD) ); /* * AMD has separate bits for each SPEC_CTRL bit. * arch/x86/kernel/cpu/bugs.c is kind enough to * record that in cpufeatures so use them. */ if (boot_cpu_has(X86_FEATURE_IBPB)) kvm_cpu_cap_set(X86_FEATURE_AMD_IBPB); if (boot_cpu_has(X86_FEATURE_IBRS)) kvm_cpu_cap_set(X86_FEATURE_AMD_IBRS); if (boot_cpu_has(X86_FEATURE_STIBP)) kvm_cpu_cap_set(X86_FEATURE_AMD_STIBP); if (boot_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD)) kvm_cpu_cap_set(X86_FEATURE_AMD_SSBD); if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) kvm_cpu_cap_set(X86_FEATURE_AMD_SSB_NO); /* * The preference is to use SPEC CTRL MSR instead of the * VIRT_SPEC MSR. */ if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) && !boot_cpu_has(X86_FEATURE_AMD_SSBD)) kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD); /* * Hide all SVM features by default, SVM will set the cap bits for * features it emulates and/or exposes for L1. */ kvm_cpu_cap_mask(CPUID_8000_000A_EDX, 0); kvm_cpu_cap_mask(CPUID_8000_001F_EAX, 0 /* SME */ | F(SEV) | 0 /* VM_PAGE_FLUSH */ | F(SEV_ES) | F(SME_COHERENT)); kvm_cpu_cap_mask(CPUID_8000_0021_EAX, F(NO_NESTED_DATA_BP) | F(LFENCE_RDTSC) | 0 /* SmmPgCfgLock */ | F(NULL_SEL_CLR_BASE) | F(AUTOIBRS) | 0 /* PrefetchCtlMsr */ | F(WRMSR_XX_BASE_NS) ); kvm_cpu_cap_check_and_set(X86_FEATURE_SBPB); kvm_cpu_cap_check_and_set(X86_FEATURE_IBPB_BRTYPE); kvm_cpu_cap_check_and_set(X86_FEATURE_SRSO_NO); kvm_cpu_cap_init_kvm_defined(CPUID_8000_0022_EAX, F(PERFMON_V2) ); /* * Synthesize "LFENCE is serializing" into the AMD-defined entry in * KVM's supported CPUID if the feature is reported as supported by the * kernel. LFENCE_RDTSC was a Linux-defined synthetic feature long * before AMD joined the bandwagon, e.g. LFENCE is serializing on most * CPUs that support SSE2. On CPUs that don't support AMD's leaf, * kvm_cpu_cap_mask() will unfortunately drop the flag due to ANDing * the mask with the raw host CPUID, and reporting support in AMD's * leaf can make it easier for userspace to detect the feature. */ if (cpu_feature_enabled(X86_FEATURE_LFENCE_RDTSC)) kvm_cpu_cap_set(X86_FEATURE_LFENCE_RDTSC); if (!static_cpu_has_bug(X86_BUG_NULL_SEG)) kvm_cpu_cap_set(X86_FEATURE_NULL_SEL_CLR_BASE); kvm_cpu_cap_set(X86_FEATURE_NO_SMM_CTL_MSR); kvm_cpu_cap_mask(CPUID_C000_0001_EDX, F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) | F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) | F(PMM) | F(PMM_EN) ); /* * Hide RDTSCP and RDPID if either feature is reported as supported but * probing MSR_TSC_AUX failed. This is purely a sanity check and * should never happen, but the guest will likely crash if RDTSCP or * RDPID is misreported, and KVM has botched MSR_TSC_AUX emulation in * the past. For example, the sanity check may fire if this instance of * KVM is running as L1 on top of an older, broken KVM. */ if (WARN_ON((kvm_cpu_cap_has(X86_FEATURE_RDTSCP) || kvm_cpu_cap_has(X86_FEATURE_RDPID)) && !kvm_is_supported_user_return_msr(MSR_TSC_AUX))) { kvm_cpu_cap_clear(X86_FEATURE_RDTSCP); kvm_cpu_cap_clear(X86_FEATURE_RDPID); } } EXPORT_SYMBOL_GPL(kvm_set_cpu_caps); struct kvm_cpuid_array { struct kvm_cpuid_entry2 *entries; int maxnent; int nent; }; static struct kvm_cpuid_entry2 *get_next_cpuid(struct kvm_cpuid_array *array) { if (array->nent >= array->maxnent) return NULL; return &array->entries[array->nent++]; } static struct kvm_cpuid_entry2 *do_host_cpuid(struct kvm_cpuid_array *array, u32 function, u32 index) { struct kvm_cpuid_entry2 *entry = get_next_cpuid(array); if (!entry) return NULL; memset(entry, 0, sizeof(*entry)); entry->function = function; entry->index = index; switch (function & 0xC0000000) { case 0x40000000: /* Hypervisor leaves are always synthesized by __do_cpuid_func. */ return entry; case 0x80000000: /* * 0x80000021 is sometimes synthesized by __do_cpuid_func, which * would result in out-of-bounds calls to do_host_cpuid. */ { static int max_cpuid_80000000; if (!READ_ONCE(max_cpuid_80000000)) WRITE_ONCE(max_cpuid_80000000, cpuid_eax(0x80000000)); if (function > READ_ONCE(max_cpuid_80000000)) return entry; } break; default: break; } cpuid_count(entry->function, entry->index, &entry->eax, &entry->ebx, &entry->ecx, &entry->edx); if (cpuid_function_is_indexed(function)) entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; return entry; } static int __do_cpuid_func_emulated(struct kvm_cpuid_array *array, u32 func) { struct kvm_cpuid_entry2 *entry; if (array->nent >= array->maxnent) return -E2BIG; entry = &array->entries[array->nent]; entry->function = func; entry->index = 0; entry->flags = 0; switch (func) { case 0: entry->eax = 7; ++array->nent; break; case 1: entry->ecx = F(MOVBE); ++array->nent; break; case 7: entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; entry->eax = 0; if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) entry->ecx = F(RDPID); ++array->nent; break; default: break; } return 0; } static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) { struct kvm_cpuid_entry2 *entry; int r, i, max_idx; /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); r = -E2BIG; entry = do_host_cpuid(array, function, 0); if (!entry) goto out; switch (function) { case 0: /* Limited to the highest leaf implemented in KVM. */ entry->eax = min(entry->eax, 0x1fU); break; case 1: cpuid_entry_override(entry, CPUID_1_EDX); cpuid_entry_override(entry, CPUID_1_ECX); break; case 2: /* * On ancient CPUs, function 2 entries are STATEFUL. That is, * CPUID(function=2, index=0) may return different results each * time, with the least-significant byte in EAX enumerating the * number of times software should do CPUID(2, 0). * * Modern CPUs, i.e. every CPU KVM has *ever* run on are less * idiotic. Intel's SDM states that EAX & 0xff "will always * return 01H. Software should ignore this value and not * interpret it as an informational descriptor", while AMD's * APM states that CPUID(2) is reserved. * * WARN if a frankenstein CPU that supports virtualization and * a stateful CPUID.0x2 is encountered. */ WARN_ON_ONCE((entry->eax & 0xff) > 1); break; /* functions 4 and 0x8000001d have additional index. */ case 4: case 0x8000001d: /* * Read entries until the cache type in the previous entry is * zero, i.e. indicates an invalid entry. */ for (i = 1; entry->eax & 0x1f; ++i) { entry = do_host_cpuid(array, function, i); if (!entry) goto out; } break; case 6: /* Thermal management */ entry->eax = 0x4; /* allow ARAT */ entry->ebx = 0; entry->ecx = 0; entry->edx = 0; break; /* function 7 has additional index. */ case 7: entry->eax = min(entry->eax, 1u); cpuid_entry_override(entry, CPUID_7_0_EBX); cpuid_entry_override(entry, CPUID_7_ECX); cpuid_entry_override(entry, CPUID_7_EDX); /* KVM only supports 0x7.0 and 0x7.1, capped above via min(). */ if (entry->eax == 1) { entry = do_host_cpuid(array, function, 1); if (!entry) goto out; cpuid_entry_override(entry, CPUID_7_1_EAX); cpuid_entry_override(entry, CPUID_7_1_EDX); entry->ebx = 0; entry->ecx = 0; } break; case 0xa: { /* Architectural Performance Monitoring */ union cpuid10_eax eax; union cpuid10_edx edx; if (!enable_pmu || !static_cpu_has(X86_FEATURE_ARCH_PERFMON)) { entry->eax = entry->ebx = entry->ecx = entry->edx = 0; break; } eax.split.version_id = kvm_pmu_cap.version; eax.split.num_counters = kvm_pmu_cap.num_counters_gp; eax.split.bit_width = kvm_pmu_cap.bit_width_gp; eax.split.mask_length = kvm_pmu_cap.events_mask_len; edx.split.num_counters_fixed = kvm_pmu_cap.num_counters_fixed; edx.split.bit_width_fixed = kvm_pmu_cap.bit_width_fixed; if (kvm_pmu_cap.version) edx.split.anythread_deprecated = 1; edx.split.reserved1 = 0; edx.split.reserved2 = 0; entry->eax = eax.full; entry->ebx = kvm_pmu_cap.events_mask; entry->ecx = 0; entry->edx = edx.full; break; } case 0x1f: case 0xb: /* * No topology; a valid topology is indicated by the presence * of subleaf 1. */ entry->eax = entry->ebx = entry->ecx = 0; break; case 0xd: { u64 permitted_xcr0 = kvm_get_filtered_xcr0(); u64 permitted_xss = kvm_caps.supported_xss; entry->eax &= permitted_xcr0; entry->ebx = xstate_required_size(permitted_xcr0, false); entry->ecx = entry->ebx; entry->edx &= permitted_xcr0 >> 32; if (!permitted_xcr0) break; entry = do_host_cpuid(array, function, 1); if (!entry) goto out; cpuid_entry_override(entry, CPUID_D_1_EAX); if (entry->eax & (F(XSAVES)|F(XSAVEC))) entry->ebx = xstate_required_size(permitted_xcr0 | permitted_xss, true); else { WARN_ON_ONCE(permitted_xss != 0); entry->ebx = 0; } entry->ecx &= permitted_xss; entry->edx &= permitted_xss >> 32; for (i = 2; i < 64; ++i) { bool s_state; if (permitted_xcr0 & BIT_ULL(i)) s_state = false; else if (permitted_xss & BIT_ULL(i)) s_state = true; else continue; entry = do_host_cpuid(array, function, i); if (!entry) goto out; /* * The supported check above should have filtered out * invalid sub-leafs. Only valid sub-leafs should * reach this point, and they should have a non-zero * save state size. Furthermore, check whether the * processor agrees with permitted_xcr0/permitted_xss * on whether this is an XCR0- or IA32_XSS-managed area. */ if (WARN_ON_ONCE(!entry->eax || (entry->ecx & 0x1) != s_state)) { --array->nent; continue; } if (!kvm_cpu_cap_has(X86_FEATURE_XFD)) entry->ecx &= ~BIT_ULL(2); entry->edx = 0; } break; } case 0x12: /* Intel SGX */ if (!kvm_cpu_cap_has(X86_FEATURE_SGX)) { entry->eax = entry->ebx = entry->ecx = entry->edx = 0; break; } /* * Index 0: Sub-features, MISCSELECT (a.k.a extended features) * and max enclave sizes. The SGX sub-features and MISCSELECT * are restricted by kernel and KVM capabilities (like most * feature flags), while enclave size is unrestricted. */ cpuid_entry_override(entry, CPUID_12_EAX); entry->ebx &= SGX_MISC_EXINFO; entry = do_host_cpuid(array, function, 1); if (!entry) goto out; /* * Index 1: SECS.ATTRIBUTES. ATTRIBUTES are restricted a la * feature flags. Advertise all supported flags, including * privileged attributes that require explicit opt-in from * userspace. ATTRIBUTES.XFRM is not adjusted as userspace is * expected to derive it from supported XCR0. */ entry->eax &= SGX_ATTR_PRIV_MASK | SGX_ATTR_UNPRIV_MASK; entry->ebx &= 0; break; /* Intel PT */ case 0x14: if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT)) { entry->eax = entry->ebx = entry->ecx = entry->edx = 0; break; } for (i = 1, max_idx = entry->eax; i <= max_idx; ++i) { if (!do_host_cpuid(array, function, i)) goto out; } break; /* Intel AMX TILE */ case 0x1d: if (!kvm_cpu_cap_has(X86_FEATURE_AMX_TILE)) { entry->eax = entry->ebx = entry->ecx = entry->edx = 0; break; } for (i = 1, max_idx = entry->eax; i <= max_idx; ++i) { if (!do_host_cpuid(array, function, i)) goto out; } break; case 0x1e: /* TMUL information */ if (!kvm_cpu_cap_has(X86_FEATURE_AMX_TILE)) { entry->eax = entry->ebx = entry->ecx = entry->edx = 0; break; } break; case KVM_CPUID_SIGNATURE: { const u32 *sigptr = (const u32 *)KVM_SIGNATURE; entry->eax = KVM_CPUID_FEATURES; entry->ebx = sigptr[0]; entry->ecx = sigptr[1]; entry->edx = sigptr[2]; break; } case KVM_CPUID_FEATURES: entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) | (1 << KVM_FEATURE_NOP_IO_DELAY) | (1 << KVM_FEATURE_CLOCKSOURCE2) | (1 << KVM_FEATURE_ASYNC_PF) | (1 << KVM_FEATURE_PV_EOI) | (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) | (1 << KVM_FEATURE_PV_UNHALT) | (1 << KVM_FEATURE_PV_TLB_FLUSH) | (1 << KVM_FEATURE_ASYNC_PF_VMEXIT) | (1 << KVM_FEATURE_PV_SEND_IPI) | (1 << KVM_FEATURE_POLL_CONTROL) | (1 << KVM_FEATURE_PV_SCHED_YIELD) | (1 << KVM_FEATURE_ASYNC_PF_INT); if (sched_info_on()) entry->eax |= (1 << KVM_FEATURE_STEAL_TIME); entry->ebx = 0; entry->ecx = 0; entry->edx = 0; break; case 0x80000000: entry->eax = min(entry->eax, 0x80000022); /* * Serializing LFENCE is reported in a multitude of ways, and * NullSegClearsBase is not reported in CPUID on Zen2; help * userspace by providing the CPUID leaf ourselves. * * However, only do it if the host has CPUID leaf 0x8000001d. * QEMU thinks that it can query the host blindly for that * CPUID leaf if KVM reports that it supports 0x8000001d or * above. The processor merrily returns values from the * highest Intel leaf which QEMU tries to use as the guest's * 0x8000001d. Even worse, this can result in an infinite * loop if said highest leaf has no subleaves indexed by ECX. */ if (entry->eax >= 0x8000001d && (static_cpu_has(X86_FEATURE_LFENCE_RDTSC) || !static_cpu_has_bug(X86_BUG_NULL_SEG))) entry->eax = max(entry->eax, 0x80000021); break; case 0x80000001: entry->ebx &= ~GENMASK(27, 16); cpuid_entry_override(entry, CPUID_8000_0001_EDX); cpuid_entry_override(entry, CPUID_8000_0001_ECX); break; case 0x80000005: /* Pass host L1 cache and TLB info. */ break; case 0x80000006: /* Drop reserved bits, pass host L2 cache and TLB info. */ entry->edx &= ~GENMASK(17, 16); break; case 0x80000007: /* Advanced power management */ cpuid_entry_override(entry, CPUID_8000_0007_EDX); /* mask against host */ entry->edx &= boot_cpu_data.x86_power; entry->eax = entry->ebx = entry->ecx = 0; break; case 0x80000008: { unsigned g_phys_as = (entry->eax >> 16) & 0xff; unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U); unsigned phys_as = entry->eax & 0xff; /* * If TDP (NPT) is disabled use the adjusted host MAXPHYADDR as * the guest operates in the same PA space as the host, i.e. * reductions in MAXPHYADDR for memory encryption affect shadow * paging, too. * * If TDP is enabled but an explicit guest MAXPHYADDR is not * provided, use the raw bare metal MAXPHYADDR as reductions to * the HPAs do not affect GPAs. */ if (!tdp_enabled) g_phys_as = boot_cpu_data.x86_phys_bits; else if (!g_phys_as) g_phys_as = phys_as; entry->eax = g_phys_as | (virt_as << 8); entry->ecx &= ~(GENMASK(31, 16) | GENMASK(11, 8)); entry->edx = 0; cpuid_entry_override(entry, CPUID_8000_0008_EBX); break; } case 0x8000000A: if (!kvm_cpu_cap_has(X86_FEATURE_SVM)) { entry->eax = entry->ebx = entry->ecx = entry->edx = 0; break; } entry->eax = 1; /* SVM revision 1 */ entry->ebx = 8; /* Lets support 8 ASIDs in case we add proper ASID emulation to nested SVM */ entry->ecx = 0; /* Reserved */ cpuid_entry_override(entry, CPUID_8000_000A_EDX); break; case 0x80000019: entry->ecx = entry->edx = 0; break; case 0x8000001a: entry->eax &= GENMASK(2, 0); entry->ebx = entry->ecx = entry->edx = 0; break; case 0x8000001e: /* Do not return host topology information. */ entry->eax = entry->ebx = entry->ecx = 0; entry->edx = 0; /* reserved */ break; case 0x8000001F: if (!kvm_cpu_cap_has(X86_FEATURE_SEV)) { entry->eax = entry->ebx = entry->ecx = entry->edx = 0; } else { cpuid_entry_override(entry, CPUID_8000_001F_EAX); /* Clear NumVMPL since KVM does not support VMPL. */ entry->ebx &= ~GENMASK(31, 12); /* * Enumerate '0' for "PA bits reduction", the adjusted * MAXPHYADDR is enumerated directly (see 0x80000008). */ entry->ebx &= ~GENMASK(11, 6); } break; case 0x80000020: entry->eax = entry->ebx = entry->ecx = entry->edx = 0; break; case 0x80000021: entry->ebx = entry->ecx = entry->edx = 0; cpuid_entry_override(entry, CPUID_8000_0021_EAX); break; /* AMD Extended Performance Monitoring and Debug */ case 0x80000022: { union cpuid_0x80000022_ebx ebx; entry->ecx = entry->edx = 0; if (!enable_pmu || !kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2)) { entry->eax = entry->ebx; break; } cpuid_entry_override(entry, CPUID_8000_0022_EAX); if (kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2)) ebx.split.num_core_pmc = kvm_pmu_cap.num_counters_gp; else if (kvm_cpu_cap_has(X86_FEATURE_PERFCTR_CORE)) ebx.split.num_core_pmc = AMD64_NUM_COUNTERS_CORE; else ebx.split.num_core_pmc = AMD64_NUM_COUNTERS; entry->ebx = ebx.full; break; } /*Add support for Centaur's CPUID instruction*/ case 0xC0000000: /*Just support up to 0xC0000004 now*/ entry->eax = min(entry->eax, 0xC0000004); break; case 0xC0000001: cpuid_entry_override(entry, CPUID_C000_0001_EDX); break; case 3: /* Processor serial number */ case 5: /* MONITOR/MWAIT */ case 0xC0000002: case 0xC0000003: case 0xC0000004: default: entry->eax = entry->ebx = entry->ecx = entry->edx = 0; break; } r = 0; out: put_cpu(); return r; } static int do_cpuid_func(struct kvm_cpuid_array *array, u32 func, unsigned int type) { if (type == KVM_GET_EMULATED_CPUID) return __do_cpuid_func_emulated(array, func); return __do_cpuid_func(array, func); } #define CENTAUR_CPUID_SIGNATURE 0xC0000000 static int get_cpuid_func(struct kvm_cpuid_array *array, u32 func, unsigned int type) { u32 limit; int r; if (func == CENTAUR_CPUID_SIGNATURE && boot_cpu_data.x86_vendor != X86_VENDOR_CENTAUR) return 0; r = do_cpuid_func(array, func, type); if (r) return r; limit = array->entries[array->nent - 1].eax; for (func = func + 1; func <= limit; ++func) { r = do_cpuid_func(array, func, type); if (r) break; } return r; } static bool sanity_check_entries(struct kvm_cpuid_entry2 __user *entries, __u32 num_entries, unsigned int ioctl_type) { int i; __u32 pad[3]; if (ioctl_type != KVM_GET_EMULATED_CPUID) return false; /* * We want to make sure that ->padding is being passed clean from * userspace in case we want to use it for something in the future. * * Sadly, this wasn't enforced for KVM_GET_SUPPORTED_CPUID and so we * have to give ourselves satisfied only with the emulated side. /me * sheds a tear. */ for (i = 0; i < num_entries; i++) { if (copy_from_user(pad, entries[i].padding, sizeof(pad))) return true; if (pad[0] || pad[1] || pad[2]) return true; } return false; } int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries, unsigned int type) { static const u32 funcs[] = { 0, 0x80000000, CENTAUR_CPUID_SIGNATURE, KVM_CPUID_SIGNATURE, }; struct kvm_cpuid_array array = { .nent = 0, }; int r, i; if (cpuid->nent < 1) return -E2BIG; if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) cpuid->nent = KVM_MAX_CPUID_ENTRIES; if (sanity_check_entries(entries, cpuid->nent, type)) return -EINVAL; array.entries = kvcalloc(cpuid->nent, sizeof(struct kvm_cpuid_entry2), GFP_KERNEL); if (!array.entries) return -ENOMEM; array.maxnent = cpuid->nent; for (i = 0; i < ARRAY_SIZE(funcs); i++) { r = get_cpuid_func(&array, funcs[i], type); if (r) goto out_free; } cpuid->nent = array.nent; if (copy_to_user(entries, array.entries, array.nent * sizeof(struct kvm_cpuid_entry2))) r = -EFAULT; out_free: kvfree(array.entries); return r; } struct kvm_cpuid_entry2 *kvm_find_cpuid_entry_index(struct kvm_vcpu *vcpu, u32 function, u32 index) { return cpuid_entry2_find(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent, function, index); } EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry_index); struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, u32 function) { return cpuid_entry2_find(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent, function, KVM_CPUID_INDEX_NOT_SIGNIFICANT); } EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry); /* * Intel CPUID semantics treats any query for an out-of-range leaf as if the * highest basic leaf (i.e. CPUID.0H:EAX) were requested. AMD CPUID semantics * returns all zeroes for any undefined leaf, whether or not the leaf is in * range. Centaur/VIA follows Intel semantics. * * A leaf is considered out-of-range if its function is higher than the maximum * supported leaf of its associated class or if its associated class does not * exist. * * There are three primary classes to be considered, with their respective * ranges described as "<base> - <top>[,<base2> - <top2>] inclusive. A primary * class exists if a guest CPUID entry for its <base> leaf exists. For a given * class, CPUID.<base>.EAX contains the max supported leaf for the class. * * - Basic: 0x00000000 - 0x3fffffff, 0x50000000 - 0x7fffffff * - Hypervisor: 0x40000000 - 0x4fffffff * - Extended: 0x80000000 - 0xbfffffff * - Centaur: 0xc0000000 - 0xcfffffff * * The Hypervisor class is further subdivided into sub-classes that each act as * their own independent class associated with a 0x100 byte range. E.g. if Qemu * is advertising support for both HyperV and KVM, the resulting Hypervisor * CPUID sub-classes are: * * - HyperV: 0x40000000 - 0x400000ff * - KVM: 0x40000100 - 0x400001ff */ static struct kvm_cpuid_entry2 * get_out_of_range_cpuid_entry(struct kvm_vcpu *vcpu, u32 *fn_ptr, u32 index) { struct kvm_cpuid_entry2 *basic, *class; u32 function = *fn_ptr; basic = kvm_find_cpuid_entry(vcpu, 0); if (!basic) return NULL; if (is_guest_vendor_amd(basic->ebx, basic->ecx, basic->edx) || is_guest_vendor_hygon(basic->ebx, basic->ecx, basic->edx)) return NULL; if (function >= 0x40000000 && function <= 0x4fffffff) class = kvm_find_cpuid_entry(vcpu, function & 0xffffff00); else if (function >= 0xc0000000) class = kvm_find_cpuid_entry(vcpu, 0xc0000000); else class = kvm_find_cpuid_entry(vcpu, function & 0x80000000); if (class && function <= class->eax) return NULL; /* * Leaf specific adjustments are also applied when redirecting to the * max basic entry, e.g. if the max basic leaf is 0xb but there is no * entry for CPUID.0xb.index (see below), then the output value for EDX * needs to be pulled from CPUID.0xb.1. */ *fn_ptr = basic->eax; /* * The class does not exist or the requested function is out of range; * the effective CPUID entry is the max basic leaf. Note, the index of * the original requested leaf is observed! */ return kvm_find_cpuid_entry_index(vcpu, basic->eax, index); } bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, bool exact_only) { u32 orig_function = *eax, function = *eax, index = *ecx; struct kvm_cpuid_entry2 *entry; bool exact, used_max_basic = false; entry = kvm_find_cpuid_entry_index(vcpu, function, index); exact = !!entry; if (!entry && !exact_only) { entry = get_out_of_range_cpuid_entry(vcpu, &function, index); used_max_basic = !!entry; } if (entry) { *eax = entry->eax; *ebx = entry->ebx; *ecx = entry->ecx; *edx = entry->edx; if (function == 7 && index == 0) { u64 data; if (!__kvm_get_msr(vcpu, MSR_IA32_TSX_CTRL, &data, true) && (data & TSX_CTRL_CPUID_CLEAR)) *ebx &= ~(F(RTM) | F(HLE)); } else if (function == 0x80000007) { if (kvm_hv_invtsc_suppressed(vcpu)) *edx &= ~SF(CONSTANT_TSC); } } else { *eax = *ebx = *ecx = *edx = 0; /* * When leaf 0BH or 1FH is defined, CL is pass-through * and EDX is always the x2APIC ID, even for undefined * subleaves. Index 1 will exist iff the leaf is * implemented, so we pass through CL iff leaf 1 * exists. EDX can be copied from any existing index. */ if (function == 0xb || function == 0x1f) { entry = kvm_find_cpuid_entry_index(vcpu, function, 1); if (entry) { *ecx = index & 0xff; *edx = entry->edx; } } } trace_kvm_cpuid(orig_function, index, *eax, *ebx, *ecx, *edx, exact, used_max_basic); return exact; } EXPORT_SYMBOL_GPL(kvm_cpuid); int kvm_emulate_cpuid(struct kvm_vcpu *vcpu) { u32 eax, ebx, ecx, edx; if (cpuid_fault_enabled(vcpu) && !kvm_require_cpl(vcpu, 0)) return 1; eax = kvm_rax_read(vcpu); ecx = kvm_rcx_read(vcpu); kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx, false); kvm_rax_write(vcpu, eax); kvm_rbx_write(vcpu, ebx); kvm_rcx_write(vcpu, ecx); kvm_rdx_write(vcpu, edx); return kvm_skip_emulated_instruction(vcpu); } EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 3 1 2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 // SPDX-License-Identifier: GPL-2.0-only /* * A driver for the Griffin Technology, Inc. "PowerMate" USB controller dial. * * v1.1, (c)2002 William R Sowerbutts <will@sowerbutts.com> * * This device is a anodised aluminium knob which connects over USB. It can measure * clockwise and anticlockwise rotation. The dial also acts as a pushbutton with * a spring for automatic release. The base contains a pair of LEDs which illuminate * the translucent base. It rotates without limit and reports its relative rotation * back to the host when polled by the USB controller. * * Testing with the knob I have has shown that it measures approximately 94 "clicks" * for one full rotation. Testing with my High Speed Rotation Actuator (ok, it was * a variable speed cordless electric drill) has shown that the device can measure * speeds of up to 7 clicks either clockwise or anticlockwise between pollings from * the host. If it counts more than 7 clicks before it is polled, it will wrap back * to zero and start counting again. This was at quite high speed, however, almost * certainly faster than the human hand could turn it. Griffin say that it loses a * pulse or two on a direction change; the granularity is so fine that I never * noticed this in practice. * * The device's microcontroller can be programmed to set the LED to either a constant * intensity, or to a rhythmic pulsing. Several patterns and speeds are available. * * Griffin were very happy to provide documentation and free hardware for development. * * Some userspace tools are available on the web: http://sowerbutts.com/powermate/ * */ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/spinlock.h> #include <linux/usb/input.h> #define POWERMATE_VENDOR 0x077d /* Griffin Technology, Inc. */ #define POWERMATE_PRODUCT_NEW 0x0410 /* Griffin PowerMate */ #define POWERMATE_PRODUCT_OLD 0x04AA /* Griffin soundKnob */ #define CONTOUR_VENDOR 0x05f3 /* Contour Design, Inc. */ #define CONTOUR_JOG 0x0240 /* Jog and Shuttle */ /* these are the command codes we send to the device */ #define SET_STATIC_BRIGHTNESS 0x01 #define SET_PULSE_ASLEEP 0x02 #define SET_PULSE_AWAKE 0x03 #define SET_PULSE_MODE 0x04 /* these refer to bits in the powermate_device's requires_update field. */ #define UPDATE_STATIC_BRIGHTNESS (1<<0) #define UPDATE_PULSE_ASLEEP (1<<1) #define UPDATE_PULSE_AWAKE (1<<2) #define UPDATE_PULSE_MODE (1<<3) /* at least two versions of the hardware exist, with differing payload sizes. the first three bytes always contain the "interesting" data in the relevant format. */ #define POWERMATE_PAYLOAD_SIZE_MAX 6 #define POWERMATE_PAYLOAD_SIZE_MIN 3 struct powermate_device { signed char *data; dma_addr_t data_dma; struct urb *irq, *config; struct usb_ctrlrequest *configcr; struct usb_device *udev; struct usb_interface *intf; struct input_dev *input; spinlock_t lock; int static_brightness; int pulse_speed; int pulse_table; int pulse_asleep; int pulse_awake; int requires_update; // physical settings which are out of sync char phys[64]; }; static char pm_name_powermate[] = "Griffin PowerMate"; static char pm_name_soundknob[] = "Griffin SoundKnob"; static void powermate_config_complete(struct urb *urb); /* Callback for data arriving from the PowerMate over the USB interrupt pipe */ static void powermate_irq(struct urb *urb) { struct powermate_device *pm = urb->context; struct device *dev = &pm->intf->dev; int retval; switch (urb->status) { case 0: /* success */ break; case -ECONNRESET: case -ENOENT: case -ESHUTDOWN: /* this urb is terminated, clean up */ dev_dbg(dev, "%s - urb shutting down with status: %d\n", __func__, urb->status); return; default: dev_dbg(dev, "%s - nonzero urb status received: %d\n", __func__, urb->status); goto exit; } /* handle updates to device state */ input_report_key(pm->input, BTN_0, pm->data[0] & 0x01); input_report_rel(pm->input, REL_DIAL, pm->data[1]); input_sync(pm->input); exit: retval = usb_submit_urb (urb, GFP_ATOMIC); if (retval) dev_err(dev, "%s - usb_submit_urb failed with result: %d\n", __func__, retval); } /* Decide if we need to issue a control message and do so. Must be called with pm->lock taken */ static void powermate_sync_state(struct powermate_device *pm) { if (pm->requires_update == 0) return; /* no updates are required */ if (pm->config->status == -EINPROGRESS) return; /* an update is already in progress; it'll issue this update when it completes */ if (pm->requires_update & UPDATE_PULSE_ASLEEP){ pm->configcr->wValue = cpu_to_le16( SET_PULSE_ASLEEP ); pm->configcr->wIndex = cpu_to_le16( pm->pulse_asleep ? 1 : 0 ); pm->requires_update &= ~UPDATE_PULSE_ASLEEP; }else if (pm->requires_update & UPDATE_PULSE_AWAKE){ pm->configcr->wValue = cpu_to_le16( SET_PULSE_AWAKE ); pm->configcr->wIndex = cpu_to_le16( pm->pulse_awake ? 1 : 0 ); pm->requires_update &= ~UPDATE_PULSE_AWAKE; }else if (pm->requires_update & UPDATE_PULSE_MODE){ int op, arg; /* the powermate takes an operation and an argument for its pulse algorithm. the operation can be: 0: divide the speed 1: pulse at normal speed 2: multiply the speed the argument only has an effect for operations 0 and 2, and ranges between 1 (least effect) to 255 (maximum effect). thus, several states are equivalent and are coalesced into one state. we map this onto a range from 0 to 510, with: 0 -- 254 -- use divide (0 = slowest) 255 -- use normal speed 256 -- 510 -- use multiple (510 = fastest). Only values of 'arg' quite close to 255 are particularly useful/spectacular. */ if (pm->pulse_speed < 255) { op = 0; // divide arg = 255 - pm->pulse_speed; } else if (pm->pulse_speed > 255) { op = 2; // multiply arg = pm->pulse_speed - 255; } else { op = 1; // normal speed arg = 0; // can be any value } pm->configcr->wValue = cpu_to_le16( (pm->pulse_table << 8) | SET_PULSE_MODE ); pm->configcr->wIndex = cpu_to_le16( (arg << 8) | op ); pm->requires_update &= ~UPDATE_PULSE_MODE; } else if (pm->requires_update & UPDATE_STATIC_BRIGHTNESS) { pm->configcr->wValue = cpu_to_le16( SET_STATIC_BRIGHTNESS ); pm->configcr->wIndex = cpu_to_le16( pm->static_brightness ); pm->requires_update &= ~UPDATE_STATIC_BRIGHTNESS; } else { printk(KERN_ERR "powermate: unknown update required"); pm->requires_update = 0; /* fudge the bug */ return; } /* printk("powermate: %04x %04x\n", pm->configcr->wValue, pm->configcr->wIndex); */ pm->configcr->bRequestType = 0x41; /* vendor request */ pm->configcr->bRequest = 0x01; pm->configcr->wLength = 0; usb_fill_control_urb(pm->config, pm->udev, usb_sndctrlpipe(pm->udev, 0), (void *) pm->configcr, NULL, 0, powermate_config_complete, pm); if (usb_submit_urb(pm->config, GFP_ATOMIC)) printk(KERN_ERR "powermate: usb_submit_urb(config) failed"); } /* Called when our asynchronous control message completes. We may need to issue another immediately */ static void powermate_config_complete(struct urb *urb) { struct powermate_device *pm = urb->context; unsigned long flags; if (urb->status) printk(KERN_ERR "powermate: config urb returned %d\n", urb->status); spin_lock_irqsave(&pm->lock, flags); powermate_sync_state(pm); spin_unlock_irqrestore(&pm->lock, flags); } /* Set the LED up as described and begin the sync with the hardware if required */ static void powermate_pulse_led(struct powermate_device *pm, int static_brightness, int pulse_speed, int pulse_table, int pulse_asleep, int pulse_awake) { unsigned long flags; if (pulse_speed < 0) pulse_speed = 0; if (pulse_table < 0) pulse_table = 0; if (pulse_speed > 510) pulse_speed = 510; if (pulse_table > 2) pulse_table = 2; pulse_asleep = !!pulse_asleep; pulse_awake = !!pulse_awake; spin_lock_irqsave(&pm->lock, flags); /* mark state updates which are required */ if (static_brightness != pm->static_brightness) { pm->static_brightness = static_brightness; pm->requires_update |= UPDATE_STATIC_BRIGHTNESS; } if (pulse_asleep != pm->pulse_asleep) { pm->pulse_asleep = pulse_asleep; pm->requires_update |= (UPDATE_PULSE_ASLEEP | UPDATE_STATIC_BRIGHTNESS); } if (pulse_awake != pm->pulse_awake) { pm->pulse_awake = pulse_awake; pm->requires_update |= (UPDATE_PULSE_AWAKE | UPDATE_STATIC_BRIGHTNESS); } if (pulse_speed != pm->pulse_speed || pulse_table != pm->pulse_table) { pm->pulse_speed = pulse_speed; pm->pulse_table = pulse_table; pm->requires_update |= UPDATE_PULSE_MODE; } powermate_sync_state(pm); spin_unlock_irqrestore(&pm->lock, flags); } /* Callback from the Input layer when an event arrives from userspace to configure the LED */ static int powermate_input_event(struct input_dev *dev, unsigned int type, unsigned int code, int _value) { unsigned int command = (unsigned int)_value; struct powermate_device *pm = input_get_drvdata(dev); if (type == EV_MSC && code == MSC_PULSELED){ /* bits 0- 7: 8 bits: LED brightness bits 8-16: 9 bits: pulsing speed modifier (0 ... 510); 0-254 = slower, 255 = standard, 256-510 = faster. bits 17-18: 2 bits: pulse table (0, 1, 2 valid) bit 19: 1 bit : pulse whilst asleep? bit 20: 1 bit : pulse constantly? */ int static_brightness = command & 0xFF; // bits 0-7 int pulse_speed = (command >> 8) & 0x1FF; // bits 8-16 int pulse_table = (command >> 17) & 0x3; // bits 17-18 int pulse_asleep = (command >> 19) & 0x1; // bit 19 int pulse_awake = (command >> 20) & 0x1; // bit 20 powermate_pulse_led(pm, static_brightness, pulse_speed, pulse_table, pulse_asleep, pulse_awake); } return 0; } static int powermate_alloc_buffers(struct usb_device *udev, struct powermate_device *pm) { pm->data = usb_alloc_coherent(udev, POWERMATE_PAYLOAD_SIZE_MAX, GFP_KERNEL, &pm->data_dma); if (!pm->data) return -1; pm->configcr = kmalloc(sizeof(*(pm->configcr)), GFP_KERNEL); if (!pm->configcr) return -ENOMEM; return 0; } static void powermate_free_buffers(struct usb_device *udev, struct powermate_device *pm) { usb_free_coherent(udev, POWERMATE_PAYLOAD_SIZE_MAX, pm->data, pm->data_dma); kfree(pm->configcr); } /* Called whenever a USB device matching one in our supported devices table is connected */ static int powermate_probe(struct usb_interface *intf, const struct usb_device_id *id) { struct usb_device *udev = interface_to_usbdev (intf); struct usb_host_interface *interface; struct usb_endpoint_descriptor *endpoint; struct powermate_device *pm; struct input_dev *input_dev; int pipe, maxp; int error = -ENOMEM; interface = intf->cur_altsetting; if (interface->desc.bNumEndpoints < 1) return -EINVAL; endpoint = &interface->endpoint[0].desc; if (!usb_endpoint_is_int_in(endpoint)) return -EIO; usb_control_msg(udev, usb_sndctrlpipe(udev, 0), 0x0a, USB_TYPE_CLASS | USB_RECIP_INTERFACE, 0, interface->desc.bInterfaceNumber, NULL, 0, USB_CTRL_SET_TIMEOUT); pm = kzalloc(sizeof(struct powermate_device), GFP_KERNEL); input_dev = input_allocate_device(); if (!pm || !input_dev) goto fail1; if (powermate_alloc_buffers(udev, pm)) goto fail2; pm->irq = usb_alloc_urb(0, GFP_KERNEL); if (!pm->irq) goto fail2; pm->config = usb_alloc_urb(0, GFP_KERNEL); if (!pm->config) goto fail3; pm->udev = udev; pm->intf = intf; pm->input = input_dev; usb_make_path(udev, pm->phys, sizeof(pm->phys)); strlcat(pm->phys, "/input0", sizeof(pm->phys)); spin_lock_init(&pm->lock); switch (le16_to_cpu(udev->descriptor.idProduct)) { case POWERMATE_PRODUCT_NEW: input_dev->name = pm_name_powermate; break; case POWERMATE_PRODUCT_OLD: input_dev->name = pm_name_soundknob; break; default: input_dev->name = pm_name_soundknob; printk(KERN_WARNING "powermate: unknown product id %04x\n", le16_to_cpu(udev->descriptor.idProduct)); } input_dev->phys = pm->phys; usb_to_input_id(udev, &input_dev->id); input_dev->dev.parent = &intf->dev; input_set_drvdata(input_dev, pm); input_dev->event = powermate_input_event; input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_REL) | BIT_MASK(EV_MSC); input_dev->keybit[BIT_WORD(BTN_0)] = BIT_MASK(BTN_0); input_dev->relbit[BIT_WORD(REL_DIAL)] = BIT_MASK(REL_DIAL); input_dev->mscbit[BIT_WORD(MSC_PULSELED)] = BIT_MASK(MSC_PULSELED); /* get a handle to the interrupt data pipe */ pipe = usb_rcvintpipe(udev, endpoint->bEndpointAddress); maxp = usb_maxpacket(udev, pipe); if (maxp < POWERMATE_PAYLOAD_SIZE_MIN || maxp > POWERMATE_PAYLOAD_SIZE_MAX) { printk(KERN_WARNING "powermate: Expected payload of %d--%d bytes, found %d bytes!\n", POWERMATE_PAYLOAD_SIZE_MIN, POWERMATE_PAYLOAD_SIZE_MAX, maxp); maxp = POWERMATE_PAYLOAD_SIZE_MAX; } usb_fill_int_urb(pm->irq, udev, pipe, pm->data, maxp, powermate_irq, pm, endpoint->bInterval); pm->irq->transfer_dma = pm->data_dma; pm->irq->transfer_flags |= URB_NO_TRANSFER_DMA_MAP; /* register our interrupt URB with the USB system */ if (usb_submit_urb(pm->irq, GFP_KERNEL)) { error = -EIO; goto fail4; } error = input_register_device(pm->input); if (error) goto fail5; /* force an update of everything */ pm->requires_update = UPDATE_PULSE_ASLEEP | UPDATE_PULSE_AWAKE | UPDATE_PULSE_MODE | UPDATE_STATIC_BRIGHTNESS; powermate_pulse_led(pm, 0x80, 255, 0, 1, 0); // set default pulse parameters usb_set_intfdata(intf, pm); return 0; fail5: usb_kill_urb(pm->irq); fail4: usb_free_urb(pm->config); fail3: usb_free_urb(pm->irq); fail2: powermate_free_buffers(udev, pm); fail1: input_free_device(input_dev); kfree(pm); return error; } /* Called when a USB device we've accepted ownership of is removed */ static void powermate_disconnect(struct usb_interface *intf) { struct powermate_device *pm = usb_get_intfdata (intf); usb_set_intfdata(intf, NULL); if (pm) { pm->requires_update = 0; usb_kill_urb(pm->irq); input_unregister_device(pm->input); usb_kill_urb(pm->config); usb_free_urb(pm->irq); usb_free_urb(pm->config); powermate_free_buffers(interface_to_usbdev(intf), pm); kfree(pm); } } static const struct usb_device_id powermate_devices[] = { { USB_DEVICE(POWERMATE_VENDOR, POWERMATE_PRODUCT_NEW) }, { USB_DEVICE(POWERMATE_VENDOR, POWERMATE_PRODUCT_OLD) }, { USB_DEVICE(CONTOUR_VENDOR, CONTOUR_JOG) }, { } /* Terminating entry */ }; MODULE_DEVICE_TABLE (usb, powermate_devices); static struct usb_driver powermate_driver = { .name = "powermate", .probe = powermate_probe, .disconnect = powermate_disconnect, .id_table = powermate_devices, }; module_usb_driver(powermate_driver); MODULE_AUTHOR( "William R Sowerbutts" ); MODULE_DESCRIPTION( "Griffin Technology, Inc PowerMate driver" ); MODULE_LICENSE("GPL");
13 16 16 13 13 13 13 13 13 13 1 1 1 20 20 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 // SPDX-License-Identifier: GPL-2.0 #include <linux/ceph/ceph_debug.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/ceph/libceph.h> #include <linux/ceph/osdmap.h> #include <linux/ceph/decode.h> #include <linux/crush/hash.h> #include <linux/crush/mapper.h> static __printf(2, 3) void osdmap_info(const struct ceph_osdmap *map, const char *fmt, ...) { struct va_format vaf; va_list args; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; printk(KERN_INFO "%s (%pU e%u): %pV", KBUILD_MODNAME, &map->fsid, map->epoch, &vaf); va_end(args); } char *ceph_osdmap_state_str(char *str, int len, u32 state) { if (!len) return str; if ((state & CEPH_OSD_EXISTS) && (state & CEPH_OSD_UP)) snprintf(str, len, "exists, up"); else if (state & CEPH_OSD_EXISTS) snprintf(str, len, "exists"); else if (state & CEPH_OSD_UP) snprintf(str, len, "up"); else snprintf(str, len, "doesn't exist"); return str; } /* maps */ static int calc_bits_of(unsigned int t) { int b = 0; while (t) { t = t >> 1; b++; } return b; } /* * the foo_mask is the smallest value 2^n-1 that is >= foo. */ static void calc_pg_masks(struct ceph_pg_pool_info *pi) { pi->pg_num_mask = (1 << calc_bits_of(pi->pg_num-1)) - 1; pi->pgp_num_mask = (1 << calc_bits_of(pi->pgp_num-1)) - 1; } /* * decode crush map */ static int crush_decode_uniform_bucket(void **p, void *end, struct crush_bucket_uniform *b) { dout("crush_decode_uniform_bucket %p to %p\n", *p, end); ceph_decode_need(p, end, (1+b->h.size) * sizeof(u32), bad); b->item_weight = ceph_decode_32(p); return 0; bad: return -EINVAL; } static int crush_decode_list_bucket(void **p, void *end, struct crush_bucket_list *b) { int j; dout("crush_decode_list_bucket %p to %p\n", *p, end); b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS); if (b->item_weights == NULL) return -ENOMEM; b->sum_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS); if (b->sum_weights == NULL) return -ENOMEM; ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad); for (j = 0; j < b->h.size; j++) { b->item_weights[j] = ceph_decode_32(p); b->sum_weights[j] = ceph_decode_32(p); } return 0; bad: return -EINVAL; } static int crush_decode_tree_bucket(void **p, void *end, struct crush_bucket_tree *b) { int j; dout("crush_decode_tree_bucket %p to %p\n", *p, end); ceph_decode_8_safe(p, end, b->num_nodes, bad); b->node_weights = kcalloc(b->num_nodes, sizeof(u32), GFP_NOFS); if (b->node_weights == NULL) return -ENOMEM; ceph_decode_need(p, end, b->num_nodes * sizeof(u32), bad); for (j = 0; j < b->num_nodes; j++) b->node_weights[j] = ceph_decode_32(p); return 0; bad: return -EINVAL; } static int crush_decode_straw_bucket(void **p, void *end, struct crush_bucket_straw *b) { int j; dout("crush_decode_straw_bucket %p to %p\n", *p, end); b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS); if (b->item_weights == NULL) return -ENOMEM; b->straws = kcalloc(b->h.size, sizeof(u32), GFP_NOFS); if (b->straws == NULL) return -ENOMEM; ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad); for (j = 0; j < b->h.size; j++) { b->item_weights[j] = ceph_decode_32(p); b->straws[j] = ceph_decode_32(p); } return 0; bad: return -EINVAL; } static int crush_decode_straw2_bucket(void **p, void *end, struct crush_bucket_straw2 *b) { int j; dout("crush_decode_straw2_bucket %p to %p\n", *p, end); b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS); if (b->item_weights == NULL) return -ENOMEM; ceph_decode_need(p, end, b->h.size * sizeof(u32), bad); for (j = 0; j < b->h.size; j++) b->item_weights[j] = ceph_decode_32(p); return 0; bad: return -EINVAL; } struct crush_name_node { struct rb_node cn_node; int cn_id; char cn_name[]; }; static struct crush_name_node *alloc_crush_name(size_t name_len) { struct crush_name_node *cn; cn = kmalloc(sizeof(*cn) + name_len + 1, GFP_NOIO); if (!cn) return NULL; RB_CLEAR_NODE(&cn->cn_node); return cn; } static void free_crush_name(struct crush_name_node *cn) { WARN_ON(!RB_EMPTY_NODE(&cn->cn_node)); kfree(cn); } DEFINE_RB_FUNCS(crush_name, struct crush_name_node, cn_id, cn_node) static int decode_crush_names(void **p, void *end, struct rb_root *root) { u32 n; ceph_decode_32_safe(p, end, n, e_inval); while (n--) { struct crush_name_node *cn; int id; u32 name_len; ceph_decode_32_safe(p, end, id, e_inval); ceph_decode_32_safe(p, end, name_len, e_inval); ceph_decode_need(p, end, name_len, e_inval); cn = alloc_crush_name(name_len); if (!cn) return -ENOMEM; cn->cn_id = id; memcpy(cn->cn_name, *p, name_len); cn->cn_name[name_len] = '\0'; *p += name_len; if (!__insert_crush_name(root, cn)) { free_crush_name(cn); return -EEXIST; } } return 0; e_inval: return -EINVAL; } void clear_crush_names(struct rb_root *root) { while (!RB_EMPTY_ROOT(root)) { struct crush_name_node *cn = rb_entry(rb_first(root), struct crush_name_node, cn_node); erase_crush_name(root, cn); free_crush_name(cn); } } static struct crush_choose_arg_map *alloc_choose_arg_map(void) { struct crush_choose_arg_map *arg_map; arg_map = kzalloc(sizeof(*arg_map), GFP_NOIO); if (!arg_map) return NULL; RB_CLEAR_NODE(&arg_map->node); return arg_map; } static void free_choose_arg_map(struct crush_choose_arg_map *arg_map) { if (arg_map) { int i, j; WARN_ON(!RB_EMPTY_NODE(&arg_map->node)); for (i = 0; i < arg_map->size; i++) { struct crush_choose_arg *arg = &arg_map->args[i]; for (j = 0; j < arg->weight_set_size; j++) kfree(arg->weight_set[j].weights); kfree(arg->weight_set); kfree(arg->ids); } kfree(arg_map->args); kfree(arg_map); } } DEFINE_RB_FUNCS(choose_arg_map, struct crush_choose_arg_map, choose_args_index, node); void clear_choose_args(struct crush_map *c) { while (!RB_EMPTY_ROOT(&c->choose_args)) { struct crush_choose_arg_map *arg_map = rb_entry(rb_first(&c->choose_args), struct crush_choose_arg_map, node); erase_choose_arg_map(&c->choose_args, arg_map); free_choose_arg_map(arg_map); } } static u32 *decode_array_32_alloc(void **p, void *end, u32 *plen) { u32 *a = NULL; u32 len; int ret; ceph_decode_32_safe(p, end, len, e_inval); if (len) { u32 i; a = kmalloc_array(len, sizeof(u32), GFP_NOIO); if (!a) { ret = -ENOMEM; goto fail; } ceph_decode_need(p, end, len * sizeof(u32), e_inval); for (i = 0; i < len; i++) a[i] = ceph_decode_32(p); } *plen = len; return a; e_inval: ret = -EINVAL; fail: kfree(a); return ERR_PTR(ret); } /* * Assumes @arg is zero-initialized. */ static int decode_choose_arg(void **p, void *end, struct crush_choose_arg *arg) { int ret; ceph_decode_32_safe(p, end, arg->weight_set_size, e_inval); if (arg->weight_set_size) { u32 i; arg->weight_set = kmalloc_array(arg->weight_set_size, sizeof(*arg->weight_set), GFP_NOIO); if (!arg->weight_set) return -ENOMEM; for (i = 0; i < arg->weight_set_size; i++) { struct crush_weight_set *w = &arg->weight_set[i]; w->weights = decode_array_32_alloc(p, end, &w->size); if (IS_ERR(w->weights)) { ret = PTR_ERR(w->weights); w->weights = NULL; return ret; } } } arg->ids = decode_array_32_alloc(p, end, &arg->ids_size); if (IS_ERR(arg->ids)) { ret = PTR_ERR(arg->ids); arg->ids = NULL; return ret; } return 0; e_inval: return -EINVAL; } static int decode_choose_args(void **p, void *end, struct crush_map *c) { struct crush_choose_arg_map *arg_map = NULL; u32 num_choose_arg_maps, num_buckets; int ret; ceph_decode_32_safe(p, end, num_choose_arg_maps, e_inval); while (num_choose_arg_maps--) { arg_map = alloc_choose_arg_map(); if (!arg_map) { ret = -ENOMEM; goto fail; } ceph_decode_64_safe(p, end, arg_map->choose_args_index, e_inval); arg_map->size = c->max_buckets; arg_map->args = kcalloc(arg_map->size, sizeof(*arg_map->args), GFP_NOIO); if (!arg_map->args) { ret = -ENOMEM; goto fail; } ceph_decode_32_safe(p, end, num_buckets, e_inval); while (num_buckets--) { struct crush_choose_arg *arg; u32 bucket_index; ceph_decode_32_safe(p, end, bucket_index, e_inval); if (bucket_index >= arg_map->size) goto e_inval; arg = &arg_map->args[bucket_index]; ret = decode_choose_arg(p, end, arg); if (ret) goto fail; if (arg->ids_size && arg->ids_size != c->buckets[bucket_index]->size) goto e_inval; } insert_choose_arg_map(&c->choose_args, arg_map); } return 0; e_inval: ret = -EINVAL; fail: free_choose_arg_map(arg_map); return ret; } static void crush_finalize(struct crush_map *c) { __s32 b; /* Space for the array of pointers to per-bucket workspace */ c->working_size = sizeof(struct crush_work) + c->max_buckets * sizeof(struct crush_work_bucket *); for (b = 0; b < c->max_buckets; b++) { if (!c->buckets[b]) continue; switch (c->buckets[b]->alg) { default: /* * The base case, permutation variables and * the pointer to the permutation array. */ c->working_size += sizeof(struct crush_work_bucket); break; } /* Every bucket has a permutation array. */ c->working_size += c->buckets[b]->size * sizeof(__u32); } } static struct crush_map *crush_decode(void *pbyval, void *end) { struct crush_map *c; int err; int i, j; void **p = &pbyval; void *start = pbyval; u32 magic; dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p)); c = kzalloc(sizeof(*c), GFP_NOFS); if (c == NULL) return ERR_PTR(-ENOMEM); c->type_names = RB_ROOT; c->names = RB_ROOT; c->choose_args = RB_ROOT; /* set tunables to default values */ c->choose_local_tries = 2; c->choose_local_fallback_tries = 5; c->choose_total_tries = 19; c->chooseleaf_descend_once = 0; ceph_decode_need(p, end, 4*sizeof(u32), bad); magic = ceph_decode_32(p); if (magic != CRUSH_MAGIC) { pr_err("crush_decode magic %x != current %x\n", (unsigned int)magic, (unsigned int)CRUSH_MAGIC); goto bad; } c->max_buckets = ceph_decode_32(p); c->max_rules = ceph_decode_32(p); c->max_devices = ceph_decode_32(p); c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS); if (c->buckets == NULL) goto badmem; c->rules = kcalloc(c->max_rules, sizeof(*c->rules), GFP_NOFS); if (c->rules == NULL) goto badmem; /* buckets */ for (i = 0; i < c->max_buckets; i++) { int size = 0; u32 alg; struct crush_bucket *b; ceph_decode_32_safe(p, end, alg, bad); if (alg == 0) { c->buckets[i] = NULL; continue; } dout("crush_decode bucket %d off %x %p to %p\n", i, (int)(*p-start), *p, end); switch (alg) { case CRUSH_BUCKET_UNIFORM: size = sizeof(struct crush_bucket_uniform); break; case CRUSH_BUCKET_LIST: size = sizeof(struct crush_bucket_list); break; case CRUSH_BUCKET_TREE: size = sizeof(struct crush_bucket_tree); break; case CRUSH_BUCKET_STRAW: size = sizeof(struct crush_bucket_straw); break; case CRUSH_BUCKET_STRAW2: size = sizeof(struct crush_bucket_straw2); break; default: goto bad; } BUG_ON(size == 0); b = c->buckets[i] = kzalloc(size, GFP_NOFS); if (b == NULL) goto badmem; ceph_decode_need(p, end, 4*sizeof(u32), bad); b->id = ceph_decode_32(p); b->type = ceph_decode_16(p); b->alg = ceph_decode_8(p); b->hash = ceph_decode_8(p); b->weight = ceph_decode_32(p); b->size = ceph_decode_32(p); dout("crush_decode bucket size %d off %x %p to %p\n", b->size, (int)(*p-start), *p, end); b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS); if (b->items == NULL) goto badmem; ceph_decode_need(p, end, b->size*sizeof(u32), bad); for (j = 0; j < b->size; j++) b->items[j] = ceph_decode_32(p); switch (b->alg) { case CRUSH_BUCKET_UNIFORM: err = crush_decode_uniform_bucket(p, end, (struct crush_bucket_uniform *)b); if (err < 0) goto fail; break; case CRUSH_BUCKET_LIST: err = crush_decode_list_bucket(p, end, (struct crush_bucket_list *)b); if (err < 0) goto fail; break; case CRUSH_BUCKET_TREE: err = crush_decode_tree_bucket(p, end, (struct crush_bucket_tree *)b); if (err < 0) goto fail; break; case CRUSH_BUCKET_STRAW: err = crush_decode_straw_bucket(p, end, (struct crush_bucket_straw *)b); if (err < 0) goto fail; break; case CRUSH_BUCKET_STRAW2: err = crush_decode_straw2_bucket(p, end, (struct crush_bucket_straw2 *)b); if (err < 0) goto fail; break; } } /* rules */ dout("rule vec is %p\n", c->rules); for (i = 0; i < c->max_rules; i++) { u32 yes; struct crush_rule *r; ceph_decode_32_safe(p, end, yes, bad); if (!yes) { dout("crush_decode NO rule %d off %x %p to %p\n", i, (int)(*p-start), *p, end); c->rules[i] = NULL; continue; } dout("crush_decode rule %d off %x %p to %p\n", i, (int)(*p-start), *p, end); /* len */ ceph_decode_32_safe(p, end, yes, bad); #if BITS_PER_LONG == 32 if (yes > (ULONG_MAX - sizeof(*r)) / sizeof(struct crush_rule_step)) goto bad; #endif r = kmalloc(struct_size(r, steps, yes), GFP_NOFS); if (r == NULL) goto badmem; dout(" rule %d is at %p\n", i, r); c->rules[i] = r; r->len = yes; ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */ ceph_decode_need(p, end, r->len*3*sizeof(u32), bad); for (j = 0; j < r->len; j++) { r->steps[j].op = ceph_decode_32(p); r->steps[j].arg1 = ceph_decode_32(p); r->steps[j].arg2 = ceph_decode_32(p); } } err = decode_crush_names(p, end, &c->type_names); if (err) goto fail; err = decode_crush_names(p, end, &c->names); if (err) goto fail; ceph_decode_skip_map(p, end, 32, string, bad); /* rule_name_map */ /* tunables */ ceph_decode_need(p, end, 3*sizeof(u32), done); c->choose_local_tries = ceph_decode_32(p); c->choose_local_fallback_tries = ceph_decode_32(p); c->choose_total_tries = ceph_decode_32(p); dout("crush decode tunable choose_local_tries = %d\n", c->choose_local_tries); dout("crush decode tunable choose_local_fallback_tries = %d\n", c->choose_local_fallback_tries); dout("crush decode tunable choose_total_tries = %d\n", c->choose_total_tries); ceph_decode_need(p, end, sizeof(u32), done); c->chooseleaf_descend_once = ceph_decode_32(p); dout("crush decode tunable chooseleaf_descend_once = %d\n", c->chooseleaf_descend_once); ceph_decode_need(p, end, sizeof(u8), done); c->chooseleaf_vary_r = ceph_decode_8(p); dout("crush decode tunable chooseleaf_vary_r = %d\n", c->chooseleaf_vary_r); /* skip straw_calc_version, allowed_bucket_algs */ ceph_decode_need(p, end, sizeof(u8) + sizeof(u32), done); *p += sizeof(u8) + sizeof(u32); ceph_decode_need(p, end, sizeof(u8), done); c->chooseleaf_stable = ceph_decode_8(p); dout("crush decode tunable chooseleaf_stable = %d\n", c->chooseleaf_stable); if (*p != end) { /* class_map */ ceph_decode_skip_map(p, end, 32, 32, bad); /* class_name */ ceph_decode_skip_map(p, end, 32, string, bad); /* class_bucket */ ceph_decode_skip_map_of_map(p, end, 32, 32, 32, bad); } if (*p != end) { err = decode_choose_args(p, end, c); if (err) goto fail; } done: crush_finalize(c); dout("crush_decode success\n"); return c; badmem: err = -ENOMEM; fail: dout("crush_decode fail %d\n", err); crush_destroy(c); return ERR_PTR(err); bad: err = -EINVAL; goto fail; } int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs) { if (lhs->pool < rhs->pool) return -1; if (lhs->pool > rhs->pool) return 1; if (lhs->seed < rhs->seed) return -1; if (lhs->seed > rhs->seed) return 1; return 0; } int ceph_spg_compare(const struct ceph_spg *lhs, const struct ceph_spg *rhs) { int ret; ret = ceph_pg_compare(&lhs->pgid, &rhs->pgid); if (ret) return ret; if (lhs->shard < rhs->shard) return -1; if (lhs->shard > rhs->shard) return 1; return 0; } static struct ceph_pg_mapping *alloc_pg_mapping(size_t payload_len) { struct ceph_pg_mapping *pg; pg = kmalloc(sizeof(*pg) + payload_len, GFP_NOIO); if (!pg) return NULL; RB_CLEAR_NODE(&pg->node); return pg; } static void free_pg_mapping(struct ceph_pg_mapping *pg) { WARN_ON(!RB_EMPTY_NODE(&pg->node)); kfree(pg); } /* * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid * to a set of osds) and primary_temp (explicit primary setting) */ DEFINE_RB_FUNCS2(pg_mapping, struct ceph_pg_mapping, pgid, ceph_pg_compare, RB_BYPTR, const struct ceph_pg *, node) /* * rbtree of pg pool info */ DEFINE_RB_FUNCS(pg_pool, struct ceph_pg_pool_info, id, node) struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map, u64 id) { return lookup_pg_pool(&map->pg_pools, id); } const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id) { struct ceph_pg_pool_info *pi; if (id == CEPH_NOPOOL) return NULL; if (WARN_ON_ONCE(id > (u64) INT_MAX)) return NULL; pi = lookup_pg_pool(&map->pg_pools, id); return pi ? pi->name : NULL; } EXPORT_SYMBOL(ceph_pg_pool_name_by_id); int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name) { struct rb_node *rbp; for (rbp = rb_first(&map->pg_pools); rbp; rbp = rb_next(rbp)) { struct ceph_pg_pool_info *pi = rb_entry(rbp, struct ceph_pg_pool_info, node); if (pi->name && strcmp(pi->name, name) == 0) return pi->id; } return -ENOENT; } EXPORT_SYMBOL(ceph_pg_poolid_by_name); u64 ceph_pg_pool_flags(struct ceph_osdmap *map, u64 id) { struct ceph_pg_pool_info *pi; pi = lookup_pg_pool(&map->pg_pools, id); return pi ? pi->flags : 0; } EXPORT_SYMBOL(ceph_pg_pool_flags); static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi) { erase_pg_pool(root, pi); kfree(pi->name); kfree(pi); } static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi) { u8 ev, cv; unsigned len, num; void *pool_end; ceph_decode_need(p, end, 2 + 4, bad); ev = ceph_decode_8(p); /* encoding version */ cv = ceph_decode_8(p); /* compat version */ if (ev < 5) { pr_warn("got v %d < 5 cv %d of ceph_pg_pool\n", ev, cv); return -EINVAL; } if (cv > 9) { pr_warn("got v %d cv %d > 9 of ceph_pg_pool\n", ev, cv); return -EINVAL; } len = ceph_decode_32(p); ceph_decode_need(p, end, len, bad); pool_end = *p + len; pi->type = ceph_decode_8(p); pi->size = ceph_decode_8(p); pi->crush_ruleset = ceph_decode_8(p); pi->object_hash = ceph_decode_8(p); pi->pg_num = ceph_decode_32(p); pi->pgp_num = ceph_decode_32(p); *p += 4 + 4; /* skip lpg* */ *p += 4; /* skip last_change */ *p += 8 + 4; /* skip snap_seq, snap_epoch */ /* skip snaps */ num = ceph_decode_32(p); while (num--) { *p += 8; /* snapid key */ *p += 1 + 1; /* versions */ len = ceph_decode_32(p); *p += len; } /* skip removed_snaps */ num = ceph_decode_32(p); *p += num * (8 + 8); *p += 8; /* skip auid */ pi->flags = ceph_decode_64(p); *p += 4; /* skip crash_replay_interval */ if (ev >= 7) pi->min_size = ceph_decode_8(p); else pi->min_size = pi->size - pi->size / 2; if (ev >= 8) *p += 8 + 8; /* skip quota_max_* */ if (ev >= 9) { /* skip tiers */ num = ceph_decode_32(p); *p += num * 8; *p += 8; /* skip tier_of */ *p += 1; /* skip cache_mode */ pi->read_tier = ceph_decode_64(p); pi->write_tier = ceph_decode_64(p); } else { pi->read_tier = -1; pi->write_tier = -1; } if (ev >= 10) { /* skip properties */ num = ceph_decode_32(p); while (num--) { len = ceph_decode_32(p); *p += len; /* key */ len = ceph_decode_32(p); *p += len; /* val */ } } if (ev >= 11) { /* skip hit_set_params */ *p += 1 + 1; /* versions */ len = ceph_decode_32(p); *p += len; *p += 4; /* skip hit_set_period */ *p += 4; /* skip hit_set_count */ } if (ev >= 12) *p += 4; /* skip stripe_width */ if (ev >= 13) { *p += 8; /* skip target_max_bytes */ *p += 8; /* skip target_max_objects */ *p += 4; /* skip cache_target_dirty_ratio_micro */ *p += 4; /* skip cache_target_full_ratio_micro */ *p += 4; /* skip cache_min_flush_age */ *p += 4; /* skip cache_min_evict_age */ } if (ev >= 14) { /* skip erasure_code_profile */ len = ceph_decode_32(p); *p += len; } /* * last_force_op_resend_preluminous, will be overridden if the * map was encoded with RESEND_ON_SPLIT */ if (ev >= 15) pi->last_force_request_resend = ceph_decode_32(p); else pi->last_force_request_resend = 0; if (ev >= 16) *p += 4; /* skip min_read_recency_for_promote */ if (ev >= 17) *p += 8; /* skip expected_num_objects */ if (ev >= 19) *p += 4; /* skip cache_target_dirty_high_ratio_micro */ if (ev >= 20) *p += 4; /* skip min_write_recency_for_promote */ if (ev >= 21) *p += 1; /* skip use_gmt_hitset */ if (ev >= 22) *p += 1; /* skip fast_read */ if (ev >= 23) { *p += 4; /* skip hit_set_grade_decay_rate */ *p += 4; /* skip hit_set_search_last_n */ } if (ev >= 24) { /* skip opts */ *p += 1 + 1; /* versions */ len = ceph_decode_32(p); *p += len; } if (ev >= 25) pi->last_force_request_resend = ceph_decode_32(p); /* ignore the rest */ *p = pool_end; calc_pg_masks(pi); return 0; bad: return -EINVAL; } static int decode_pool_names(void **p, void *end, struct ceph_osdmap *map) { struct ceph_pg_pool_info *pi; u32 num, len; u64 pool; ceph_decode_32_safe(p, end, num, bad); dout(" %d pool names\n", num); while (num--) { ceph_decode_64_safe(p, end, pool, bad); ceph_decode_32_safe(p, end, len, bad); dout(" pool %llu len %d\n", pool, len); ceph_decode_need(p, end, len, bad); pi = lookup_pg_pool(&map->pg_pools, pool); if (pi) { char *name = kstrndup(*p, len, GFP_NOFS); if (!name) return -ENOMEM; kfree(pi->name); pi->name = name; dout(" name is %s\n", pi->name); } *p += len; } return 0; bad: return -EINVAL; } /* * CRUSH workspaces * * workspace_manager framework borrowed from fs/btrfs/compression.c. * Two simplifications: there is only one type of workspace and there * is always at least one workspace. */ static struct crush_work *alloc_workspace(const struct crush_map *c) { struct crush_work *work; size_t work_size; WARN_ON(!c->working_size); work_size = crush_work_size(c, CEPH_PG_MAX_SIZE); dout("%s work_size %zu bytes\n", __func__, work_size); work = kvmalloc(work_size, GFP_NOIO); if (!work) return NULL; INIT_LIST_HEAD(&work->item); crush_init_workspace(c, work); return work; } static void free_workspace(struct crush_work *work) { WARN_ON(!list_empty(&work->item)); kvfree(work); } static void init_workspace_manager(struct workspace_manager *wsm) { INIT_LIST_HEAD(&wsm->idle_ws); spin_lock_init(&wsm->ws_lock); atomic_set(&wsm->total_ws, 0); wsm->free_ws = 0; init_waitqueue_head(&wsm->ws_wait); } static void add_initial_workspace(struct workspace_manager *wsm, struct crush_work *work) { WARN_ON(!list_empty(&wsm->idle_ws)); list_add(&work->item, &wsm->idle_ws); atomic_set(&wsm->total_ws, 1); wsm->free_ws = 1; } static void cleanup_workspace_manager(struct workspace_manager *wsm) { struct crush_work *work; while (!list_empty(&wsm->idle_ws)) { work = list_first_entry(&wsm->idle_ws, struct crush_work, item); list_del_init(&work->item); free_workspace(work); } atomic_set(&wsm->total_ws, 0); wsm->free_ws = 0; } /* * Finds an available workspace or allocates a new one. If it's not * possible to allocate a new one, waits until there is one. */ static struct crush_work *get_workspace(struct workspace_manager *wsm, const struct crush_map *c) { struct crush_work *work; int cpus = num_online_cpus(); again: spin_lock(&wsm->ws_lock); if (!list_empty(&wsm->idle_ws)) { work = list_first_entry(&wsm->idle_ws, struct crush_work, item); list_del_init(&work->item); wsm->free_ws--; spin_unlock(&wsm->ws_lock); return work; } if (atomic_read(&wsm->total_ws) > cpus) { DEFINE_WAIT(wait); spin_unlock(&wsm->ws_lock); prepare_to_wait(&wsm->ws_wait, &wait, TASK_UNINTERRUPTIBLE); if (atomic_read(&wsm->total_ws) > cpus && !wsm->free_ws) schedule(); finish_wait(&wsm->ws_wait, &wait); goto again; } atomic_inc(&wsm->total_ws); spin_unlock(&wsm->ws_lock); work = alloc_workspace(c); if (!work) { atomic_dec(&wsm->total_ws); wake_up(&wsm->ws_wait); /* * Do not return the error but go back to waiting. We * have the initial workspace and the CRUSH computation * time is bounded so we will get it eventually. */ WARN_ON(atomic_read(&wsm->total_ws) < 1); goto again; } return work; } /* * Puts a workspace back on the list or frees it if we have enough * idle ones sitting around. */ static void put_workspace(struct workspace_manager *wsm, struct crush_work *work) { spin_lock(&wsm->ws_lock); if (wsm->free_ws <= num_online_cpus()) { list_add(&work->item, &wsm->idle_ws); wsm->free_ws++; spin_unlock(&wsm->ws_lock); goto wake; } spin_unlock(&wsm->ws_lock); free_workspace(work); atomic_dec(&wsm->total_ws); wake: if (wq_has_sleeper(&wsm->ws_wait)) wake_up(&wsm->ws_wait); } /* * osd map */ struct ceph_osdmap *ceph_osdmap_alloc(void) { struct ceph_osdmap *map; map = kzalloc(sizeof(*map), GFP_NOIO); if (!map) return NULL; map->pg_pools = RB_ROOT; map->pool_max = -1; map->pg_temp = RB_ROOT; map->primary_temp = RB_ROOT; map->pg_upmap = RB_ROOT; map->pg_upmap_items = RB_ROOT; init_workspace_manager(&map->crush_wsm); return map; } void ceph_osdmap_destroy(struct ceph_osdmap *map) { dout("osdmap_destroy %p\n", map); if (map->crush) crush_destroy(map->crush); cleanup_workspace_manager(&map->crush_wsm); while (!RB_EMPTY_ROOT(&map->pg_temp)) { struct ceph_pg_mapping *pg = rb_entry(rb_first(&map->pg_temp), struct ceph_pg_mapping, node); erase_pg_mapping(&map->pg_temp, pg); free_pg_mapping(pg); } while (!RB_EMPTY_ROOT(&map->primary_temp)) { struct ceph_pg_mapping *pg = rb_entry(rb_first(&map->primary_temp), struct ceph_pg_mapping, node); erase_pg_mapping(&map->primary_temp, pg); free_pg_mapping(pg); } while (!RB_EMPTY_ROOT(&map->pg_upmap)) { struct ceph_pg_mapping *pg = rb_entry(rb_first(&map->pg_upmap), struct ceph_pg_mapping, node); rb_erase(&pg->node, &map->pg_upmap); kfree(pg); } while (!RB_EMPTY_ROOT(&map->pg_upmap_items)) { struct ceph_pg_mapping *pg = rb_entry(rb_first(&map->pg_upmap_items), struct ceph_pg_mapping, node); rb_erase(&pg->node, &map->pg_upmap_items); kfree(pg); } while (!RB_EMPTY_ROOT(&map->pg_pools)) { struct ceph_pg_pool_info *pi = rb_entry(rb_first(&map->pg_pools), struct ceph_pg_pool_info, node); __remove_pg_pool(&map->pg_pools, pi); } kvfree(map->osd_state); kvfree(map->osd_weight); kvfree(map->osd_addr); kvfree(map->osd_primary_affinity); kfree(map); } /* * Adjust max_osd value, (re)allocate arrays. * * The new elements are properly initialized. */ static int osdmap_set_max_osd(struct ceph_osdmap *map, u32 max) { u32 *state; u32 *weight; struct ceph_entity_addr *addr; u32 to_copy; int i; dout("%s old %u new %u\n", __func__, map->max_osd, max); if (max == map->max_osd) return 0; state = kvmalloc(array_size(max, sizeof(*state)), GFP_NOFS); weight = kvmalloc(array_size(max, sizeof(*weight)), GFP_NOFS); addr = kvmalloc(array_size(max, sizeof(*addr)), GFP_NOFS); if (!state || !weight || !addr) { kvfree(state); kvfree(weight); kvfree(addr); return -ENOMEM; } to_copy = min(map->max_osd, max); if (map->osd_state) { memcpy(state, map->osd_state, to_copy * sizeof(*state)); memcpy(weight, map->osd_weight, to_copy * sizeof(*weight)); memcpy(addr, map->osd_addr, to_copy * sizeof(*addr)); kvfree(map->osd_state); kvfree(map->osd_weight); kvfree(map->osd_addr); } map->osd_state = state; map->osd_weight = weight; map->osd_addr = addr; for (i = map->max_osd; i < max; i++) { map->osd_state[i] = 0; map->osd_weight[i] = CEPH_OSD_OUT; memset(map->osd_addr + i, 0, sizeof(*map->osd_addr)); } if (map->osd_primary_affinity) { u32 *affinity; affinity = kvmalloc(array_size(max, sizeof(*affinity)), GFP_NOFS); if (!affinity) return -ENOMEM; memcpy(affinity, map->osd_primary_affinity, to_copy * sizeof(*affinity)); kvfree(map->osd_primary_affinity); map->osd_primary_affinity = affinity; for (i = map->max_osd; i < max; i++) map->osd_primary_affinity[i] = CEPH_OSD_DEFAULT_PRIMARY_AFFINITY; } map->max_osd = max; return 0; } static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush) { struct crush_work *work; if (IS_ERR(crush)) return PTR_ERR(crush); work = alloc_workspace(crush); if (!work) { crush_destroy(crush); return -ENOMEM; } if (map->crush) crush_destroy(map->crush); cleanup_workspace_manager(&map->crush_wsm); map->crush = crush; add_initial_workspace(&map->crush_wsm, work); return 0; } #define OSDMAP_WRAPPER_COMPAT_VER 7 #define OSDMAP_CLIENT_DATA_COMPAT_VER 1 /* * Return 0 or error. On success, *v is set to 0 for old (v6) osdmaps, * to struct_v of the client_data section for new (v7 and above) * osdmaps. */ static int get_osdmap_client_data_v(void **p, void *end, const char *prefix, u8 *v) { u8 struct_v; ceph_decode_8_safe(p, end, struct_v, e_inval); if (struct_v >= 7) { u8 struct_compat; ceph_decode_8_safe(p, end, struct_compat, e_inval); if (struct_compat > OSDMAP_WRAPPER_COMPAT_VER) { pr_warn("got v %d cv %d > %d of %s ceph_osdmap\n", struct_v, struct_compat, OSDMAP_WRAPPER_COMPAT_VER, prefix); return -EINVAL; } *p += 4; /* ignore wrapper struct_len */ ceph_decode_8_safe(p, end, struct_v, e_inval); ceph_decode_8_safe(p, end, struct_compat, e_inval); if (struct_compat > OSDMAP_CLIENT_DATA_COMPAT_VER) { pr_warn("got v %d cv %d > %d of %s ceph_osdmap client data\n", struct_v, struct_compat, OSDMAP_CLIENT_DATA_COMPAT_VER, prefix); return -EINVAL; } *p += 4; /* ignore client data struct_len */ } else { u16 version; *p -= 1; ceph_decode_16_safe(p, end, version, e_inval); if (version < 6) { pr_warn("got v %d < 6 of %s ceph_osdmap\n", version, prefix); return -EINVAL; } /* old osdmap encoding */ struct_v = 0; } *v = struct_v; return 0; e_inval: return -EINVAL; } static int __decode_pools(void **p, void *end, struct ceph_osdmap *map, bool incremental) { u32 n; ceph_decode_32_safe(p, end, n, e_inval); while (n--) { struct ceph_pg_pool_info *pi; u64 pool; int ret; ceph_decode_64_safe(p, end, pool, e_inval); pi = lookup_pg_pool(&map->pg_pools, pool); if (!incremental || !pi) { pi = kzalloc(sizeof(*pi), GFP_NOFS); if (!pi) return -ENOMEM; RB_CLEAR_NODE(&pi->node); pi->id = pool; if (!__insert_pg_pool(&map->pg_pools, pi)) { kfree(pi); return -EEXIST; } } ret = decode_pool(p, end, pi); if (ret) return ret; } return 0; e_inval: return -EINVAL; } static int decode_pools(void **p, void *end, struct ceph_osdmap *map) { return __decode_pools(p, end, map, false); } static int decode_new_pools(void **p, void *end, struct ceph_osdmap *map) { return __decode_pools(p, end, map, true); } typedef struct ceph_pg_mapping *(*decode_mapping_fn_t)(void **, void *, bool); static int decode_pg_mapping(void **p, void *end, struct rb_root *mapping_root, decode_mapping_fn_t fn, bool incremental) { u32 n; WARN_ON(!incremental && !fn); ceph_decode_32_safe(p, end, n, e_inval); while (n--) { struct ceph_pg_mapping *pg; struct ceph_pg pgid; int ret; ret = ceph_decode_pgid(p, end, &pgid); if (ret) return ret; pg = lookup_pg_mapping(mapping_root, &pgid); if (pg) { WARN_ON(!incremental); erase_pg_mapping(mapping_root, pg); free_pg_mapping(pg); } if (fn) { pg = fn(p, end, incremental); if (IS_ERR(pg)) return PTR_ERR(pg); if (pg) { pg->pgid = pgid; /* struct */ insert_pg_mapping(mapping_root, pg); } } } return 0; e_inval: return -EINVAL; } static struct ceph_pg_mapping *__decode_pg_temp(void **p, void *end, bool incremental) { struct ceph_pg_mapping *pg; u32 len, i; ceph_decode_32_safe(p, end, len, e_inval); if (len == 0 && incremental) return NULL; /* new_pg_temp: [] to remove */ if (len > (SIZE_MAX - sizeof(*pg)) / sizeof(u32)) return ERR_PTR(-EINVAL); ceph_decode_need(p, end, len * sizeof(u32), e_inval); pg = alloc_pg_mapping(len * sizeof(u32)); if (!pg) return ERR_PTR(-ENOMEM); pg->pg_temp.len = len; for (i = 0; i < len; i++) pg->pg_temp.osds[i] = ceph_decode_32(p); return pg; e_inval: return ERR_PTR(-EINVAL); } static int decode_pg_temp(void **p, void *end, struct ceph_osdmap *map) { return decode_pg_mapping(p, end, &map->pg_temp, __decode_pg_temp, false); } static int decode_new_pg_temp(void **p, void *end, struct ceph_osdmap *map) { return decode_pg_mapping(p, end, &map->pg_temp, __decode_pg_temp, true); } static struct ceph_pg_mapping *__decode_primary_temp(void **p, void *end, bool incremental) { struct ceph_pg_mapping *pg; u32 osd; ceph_decode_32_safe(p, end, osd, e_inval); if (osd == (u32)-1 && incremental) return NULL; /* new_primary_temp: -1 to remove */ pg = alloc_pg_mapping(0); if (!pg) return ERR_PTR(-ENOMEM); pg->primary_temp.osd = osd; return pg; e_inval: return ERR_PTR(-EINVAL); } static int decode_primary_temp(void **p, void *end, struct ceph_osdmap *map) { return decode_pg_mapping(p, end, &map->primary_temp, __decode_primary_temp, false); } static int decode_new_primary_temp(void **p, void *end, struct ceph_osdmap *map) { return decode_pg_mapping(p, end, &map->primary_temp, __decode_primary_temp, true); } u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd) { BUG_ON(osd >= map->max_osd); if (!map->osd_primary_affinity) return CEPH_OSD_DEFAULT_PRIMARY_AFFINITY; return map->osd_primary_affinity[osd]; } static int set_primary_affinity(struct ceph_osdmap *map, int osd, u32 aff) { BUG_ON(osd >= map->max_osd); if (!map->osd_primary_affinity) { int i; map->osd_primary_affinity = kvmalloc( array_size(map->max_osd, sizeof(*map->osd_primary_affinity)), GFP_NOFS); if (!map->osd_primary_affinity) return -ENOMEM; for (i = 0; i < map->max_osd; i++) map->osd_primary_affinity[i] = CEPH_OSD_DEFAULT_PRIMARY_AFFINITY; } map->osd_primary_affinity[osd] = aff; return 0; } static int decode_primary_affinity(void **p, void *end, struct ceph_osdmap *map) { u32 len, i; ceph_decode_32_safe(p, end, len, e_inval); if (len == 0) { kvfree(map->osd_primary_affinity); map->osd_primary_affinity = NULL; return 0; } if (len != map->max_osd) goto e_inval; ceph_decode_need(p, end, map->max_osd*sizeof(u32), e_inval); for (i = 0; i < map->max_osd; i++) { int ret; ret = set_primary_affinity(map, i, ceph_decode_32(p)); if (ret) return ret; } return 0; e_inval: return -EINVAL; } static int decode_new_primary_affinity(void **p, void *end, struct ceph_osdmap *map) { u32 n; ceph_decode_32_safe(p, end, n, e_inval); while (n--) { u32 osd, aff; int ret; ceph_decode_32_safe(p, end, osd, e_inval); ceph_decode_32_safe(p, end, aff, e_inval); ret = set_primary_affinity(map, osd, aff); if (ret) return ret; osdmap_info(map, "osd%d primary-affinity 0x%x\n", osd, aff); } return 0; e_inval: return -EINVAL; } static struct ceph_pg_mapping *__decode_pg_upmap(void **p, void *end, bool __unused) { return __decode_pg_temp(p, end, false); } static int decode_pg_upmap(void **p, void *end, struct ceph_osdmap *map) { return decode_pg_mapping(p, end, &map->pg_upmap, __decode_pg_upmap, false); } static int decode_new_pg_upmap(void **p, void *end, struct ceph_osdmap *map) { return decode_pg_mapping(p, end, &map->pg_upmap, __decode_pg_upmap, true); } static int decode_old_pg_upmap(void **p, void *end, struct ceph_osdmap *map) { return decode_pg_mapping(p, end, &map->pg_upmap, NULL, true); } static struct ceph_pg_mapping *__decode_pg_upmap_items(void **p, void *end, bool __unused) { struct ceph_pg_mapping *pg; u32 len, i; ceph_decode_32_safe(p, end, len, e_inval); if (len > (SIZE_MAX - sizeof(*pg)) / (2 * sizeof(u32))) return ERR_PTR(-EINVAL); ceph_decode_need(p, end, 2 * len * sizeof(u32), e_inval); pg = alloc_pg_mapping(2 * len * sizeof(u32)); if (!pg) return ERR_PTR(-ENOMEM); pg->pg_upmap_items.len = len; for (i = 0; i < len; i++) { pg->pg_upmap_items.from_to[i][0] = ceph_decode_32(p); pg->pg_upmap_items.from_to[i][1] = ceph_decode_32(p); } return pg; e_inval: return ERR_PTR(-EINVAL); } static int decode_pg_upmap_items(void **p, void *end, struct ceph_osdmap *map) { return decode_pg_mapping(p, end, &map->pg_upmap_items, __decode_pg_upmap_items, false); } static int decode_new_pg_upmap_items(void **p, void *end, struct ceph_osdmap *map) { return decode_pg_mapping(p, end, &map->pg_upmap_items, __decode_pg_upmap_items, true); } static int decode_old_pg_upmap_items(void **p, void *end, struct ceph_osdmap *map) { return decode_pg_mapping(p, end, &map->pg_upmap_items, NULL, true); } /* * decode a full map. */ static int osdmap_decode(void **p, void *end, bool msgr2, struct ceph_osdmap *map) { u8 struct_v; u32 epoch = 0; void *start = *p; u32 max; u32 len, i; int err; dout("%s %p to %p len %d\n", __func__, *p, end, (int)(end - *p)); err = get_osdmap_client_data_v(p, end, "full", &struct_v); if (err) goto bad; /* fsid, epoch, created, modified */ ceph_decode_need(p, end, sizeof(map->fsid) + sizeof(u32) + sizeof(map->created) + sizeof(map->modified), e_inval); ceph_decode_copy(p, &map->fsid, sizeof(map->fsid)); epoch = map->epoch = ceph_decode_32(p); ceph_decode_copy(p, &map->created, sizeof(map->created)); ceph_decode_copy(p, &map->modified, sizeof(map->modified)); /* pools */ err = decode_pools(p, end, map); if (err) goto bad; /* pool_name */ err = decode_pool_names(p, end, map); if (err) goto bad; ceph_decode_32_safe(p, end, map->pool_max, e_inval); ceph_decode_32_safe(p, end, map->flags, e_inval); /* max_osd */ ceph_decode_32_safe(p, end, max, e_inval); /* (re)alloc osd arrays */ err = osdmap_set_max_osd(map, max); if (err) goto bad; /* osd_state, osd_weight, osd_addrs->client_addr */ ceph_decode_need(p, end, 3*sizeof(u32) + map->max_osd*(struct_v >= 5 ? sizeof(u32) : sizeof(u8)) + sizeof(*map->osd_weight), e_inval); if (ceph_decode_32(p) != map->max_osd) goto e_inval; if (struct_v >= 5) { for (i = 0; i < map->max_osd; i++) map->osd_state[i] = ceph_decode_32(p); } else { for (i = 0; i < map->max_osd; i++) map->osd_state[i] = ceph_decode_8(p); } if (ceph_decode_32(p) != map->max_osd) goto e_inval; for (i = 0; i < map->max_osd; i++) map->osd_weight[i] = ceph_decode_32(p); if (ceph_decode_32(p) != map->max_osd) goto e_inval; for (i = 0; i < map->max_osd; i++) { struct ceph_entity_addr *addr = &map->osd_addr[i]; if (struct_v >= 8) err = ceph_decode_entity_addrvec(p, end, msgr2, addr); else err = ceph_decode_entity_addr(p, end, addr); if (err) goto bad; dout("%s osd%d addr %s\n", __func__, i, ceph_pr_addr(addr)); } /* pg_temp */ err = decode_pg_temp(p, end, map); if (err) goto bad; /* primary_temp */ if (struct_v >= 1) { err = decode_primary_temp(p, end, map); if (err) goto bad; } /* primary_affinity */ if (struct_v >= 2) { err = decode_primary_affinity(p, end, map); if (err) goto bad; } else { WARN_ON(map->osd_primary_affinity); } /* crush */ ceph_decode_32_safe(p, end, len, e_inval); err = osdmap_set_crush(map, crush_decode(*p, min(*p + len, end))); if (err) goto bad; *p += len; if (struct_v >= 3) { /* erasure_code_profiles */ ceph_decode_skip_map_of_map(p, end, string, string, string, e_inval); } if (struct_v >= 4) { err = decode_pg_upmap(p, end, map); if (err) goto bad; err = decode_pg_upmap_items(p, end, map); if (err) goto bad; } else { WARN_ON(!RB_EMPTY_ROOT(&map->pg_upmap)); WARN_ON(!RB_EMPTY_ROOT(&map->pg_upmap_items)); } /* ignore the rest */ *p = end; dout("full osdmap epoch %d max_osd %d\n", map->epoch, map->max_osd); return 0; e_inval: err = -EINVAL; bad: pr_err("corrupt full osdmap (%d) epoch %d off %d (%p of %p-%p)\n", err, epoch, (int)(*p - start), *p, start, end); print_hex_dump(KERN_DEBUG, "osdmap: ", DUMP_PREFIX_OFFSET, 16, 1, start, end - start, true); return err; } /* * Allocate and decode a full map. */ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end, bool msgr2) { struct ceph_osdmap *map; int ret; map = ceph_osdmap_alloc(); if (!map) return ERR_PTR(-ENOMEM); ret = osdmap_decode(p, end, msgr2, map); if (ret) { ceph_osdmap_destroy(map); return ERR_PTR(ret); } return map; } /* * Encoding order is (new_up_client, new_state, new_weight). Need to * apply in the (new_weight, new_state, new_up_client) order, because * an incremental map may look like e.g. * * new_up_client: { osd=6, addr=... } # set osd_state and addr * new_state: { osd=6, xorstate=EXISTS } # clear osd_state */ static int decode_new_up_state_weight(void **p, void *end, u8 struct_v, bool msgr2, struct ceph_osdmap *map) { void *new_up_client; void *new_state; void *new_weight_end; u32 len; int ret; int i; new_up_client = *p; ceph_decode_32_safe(p, end, len, e_inval); for (i = 0; i < len; ++i) { struct ceph_entity_addr addr; ceph_decode_skip_32(p, end, e_inval); if (struct_v >= 7) ret = ceph_decode_entity_addrvec(p, end, msgr2, &addr); else ret = ceph_decode_entity_addr(p, end, &addr); if (ret) return ret; } new_state = *p; ceph_decode_32_safe(p, end, len, e_inval); len *= sizeof(u32) + (struct_v >= 5 ? sizeof(u32) : sizeof(u8)); ceph_decode_need(p, end, len, e_inval); *p += len; /* new_weight */ ceph_decode_32_safe(p, end, len, e_inval); while (len--) { s32 osd; u32 w; ceph_decode_need(p, end, 2*sizeof(u32), e_inval); osd = ceph_decode_32(p); w = ceph_decode_32(p); BUG_ON(osd >= map->max_osd); osdmap_info(map, "osd%d weight 0x%x %s\n", osd, w, w == CEPH_OSD_IN ? "(in)" : (w == CEPH_OSD_OUT ? "(out)" : "")); map->osd_weight[osd] = w; /* * If we are marking in, set the EXISTS, and clear the * AUTOOUT and NEW bits. */ if (w) { map->osd_state[osd] |= CEPH_OSD_EXISTS; map->osd_state[osd] &= ~(CEPH_OSD_AUTOOUT | CEPH_OSD_NEW); } } new_weight_end = *p; /* new_state (up/down) */ *p = new_state; len = ceph_decode_32(p); while (len--) { s32 osd; u32 xorstate; osd = ceph_decode_32(p); if (struct_v >= 5) xorstate = ceph_decode_32(p); else xorstate = ceph_decode_8(p); if (xorstate == 0) xorstate = CEPH_OSD_UP; BUG_ON(osd >= map->max_osd); if ((map->osd_state[osd] & CEPH_OSD_UP) && (xorstate & CEPH_OSD_UP)) osdmap_info(map, "osd%d down\n", osd); if ((map->osd_state[osd] & CEPH_OSD_EXISTS) && (xorstate & CEPH_OSD_EXISTS)) { osdmap_info(map, "osd%d does not exist\n", osd); ret = set_primary_affinity(map, osd, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY); if (ret) return ret; memset(map->osd_addr + osd, 0, sizeof(*map->osd_addr)); map->osd_state[osd] = 0; } else { map->osd_state[osd] ^= xorstate; } } /* new_up_client */ *p = new_up_client; len = ceph_decode_32(p); while (len--) { s32 osd; struct ceph_entity_addr addr; osd = ceph_decode_32(p); BUG_ON(osd >= map->max_osd); if (struct_v >= 7) ret = ceph_decode_entity_addrvec(p, end, msgr2, &addr); else ret = ceph_decode_entity_addr(p, end, &addr); if (ret) return ret; dout("%s osd%d addr %s\n", __func__, osd, ceph_pr_addr(&addr)); osdmap_info(map, "osd%d up\n", osd); map->osd_state[osd] |= CEPH_OSD_EXISTS | CEPH_OSD_UP; map->osd_addr[osd] = addr; } *p = new_weight_end; return 0; e_inval: return -EINVAL; } /* * decode and apply an incremental map update. */ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, bool msgr2, struct ceph_osdmap *map) { struct ceph_fsid fsid; u32 epoch = 0; struct ceph_timespec modified; s32 len; u64 pool; __s64 new_pool_max; __s32 new_flags, max; void *start = *p; int err; u8 struct_v; dout("%s %p to %p len %d\n", __func__, *p, end, (int)(end - *p)); err = get_osdmap_client_data_v(p, end, "inc", &struct_v); if (err) goto bad; /* fsid, epoch, modified, new_pool_max, new_flags */ ceph_decode_need(p, end, sizeof(fsid) + sizeof(u32) + sizeof(modified) + sizeof(u64) + sizeof(u32), e_inval); ceph_decode_copy(p, &fsid, sizeof(fsid)); epoch = ceph_decode_32(p); BUG_ON(epoch != map->epoch+1); ceph_decode_copy(p, &modified, sizeof(modified)); new_pool_max = ceph_decode_64(p); new_flags = ceph_decode_32(p); /* full map? */ ceph_decode_32_safe(p, end, len, e_inval); if (len > 0) { dout("apply_incremental full map len %d, %p to %p\n", len, *p, end); return ceph_osdmap_decode(p, min(*p+len, end), msgr2); } /* new crush? */ ceph_decode_32_safe(p, end, len, e_inval); if (len > 0) { err = osdmap_set_crush(map, crush_decode(*p, min(*p + len, end))); if (err) goto bad; *p += len; } /* new flags? */ if (new_flags >= 0) map->flags = new_flags; if (new_pool_max >= 0) map->pool_max = new_pool_max; /* new max? */ ceph_decode_32_safe(p, end, max, e_inval); if (max >= 0) { err = osdmap_set_max_osd(map, max); if (err) goto bad; } map->epoch++; map->modified = modified; /* new_pools */ err = decode_new_pools(p, end, map); if (err) goto bad; /* new_pool_names */ err = decode_pool_names(p, end, map); if (err) goto bad; /* old_pool */ ceph_decode_32_safe(p, end, len, e_inval); while (len--) { struct ceph_pg_pool_info *pi; ceph_decode_64_safe(p, end, pool, e_inval); pi = lookup_pg_pool(&map->pg_pools, pool); if (pi) __remove_pg_pool(&map->pg_pools, pi); } /* new_up_client, new_state, new_weight */ err = decode_new_up_state_weight(p, end, struct_v, msgr2, map); if (err) goto bad; /* new_pg_temp */ err = decode_new_pg_temp(p, end, map); if (err) goto bad; /* new_primary_temp */ if (struct_v >= 1) { err = decode_new_primary_temp(p, end, map); if (err) goto bad; } /* new_primary_affinity */ if (struct_v >= 2) { err = decode_new_primary_affinity(p, end, map); if (err) goto bad; } if (struct_v >= 3) { /* new_erasure_code_profiles */ ceph_decode_skip_map_of_map(p, end, string, string, string, e_inval); /* old_erasure_code_profiles */ ceph_decode_skip_set(p, end, string, e_inval); } if (struct_v >= 4) { err = decode_new_pg_upmap(p, end, map); if (err) goto bad; err = decode_old_pg_upmap(p, end, map); if (err) goto bad; err = decode_new_pg_upmap_items(p, end, map); if (err) goto bad; err = decode_old_pg_upmap_items(p, end, map); if (err) goto bad; } /* ignore the rest */ *p = end; dout("inc osdmap epoch %d max_osd %d\n", map->epoch, map->max_osd); return map; e_inval: err = -EINVAL; bad: pr_err("corrupt inc osdmap (%d) epoch %d off %d (%p of %p-%p)\n", err, epoch, (int)(*p - start), *p, start, end); print_hex_dump(KERN_DEBUG, "osdmap: ", DUMP_PREFIX_OFFSET, 16, 1, start, end - start, true); return ERR_PTR(err); } void ceph_oloc_copy(struct ceph_object_locator *dest, const struct ceph_object_locator *src) { ceph_oloc_destroy(dest); dest->pool = src->pool; if (src->pool_ns) dest->pool_ns = ceph_get_string(src->pool_ns); else dest->pool_ns = NULL; } EXPORT_SYMBOL(ceph_oloc_copy); void ceph_oloc_destroy(struct ceph_object_locator *oloc) { ceph_put_string(oloc->pool_ns); } EXPORT_SYMBOL(ceph_oloc_destroy); void ceph_oid_copy(struct ceph_object_id *dest, const struct ceph_object_id *src) { ceph_oid_destroy(dest); if (src->name != src->inline_name) { /* very rare, see ceph_object_id definition */ dest->name = kmalloc(src->name_len + 1, GFP_NOIO | __GFP_NOFAIL); } else { dest->name = dest->inline_name; } memcpy(dest->name, src->name, src->name_len + 1); dest->name_len = src->name_len; } EXPORT_SYMBOL(ceph_oid_copy); static __printf(2, 0) int oid_printf_vargs(struct ceph_object_id *oid, const char *fmt, va_list ap) { int len; WARN_ON(!ceph_oid_empty(oid)); len = vsnprintf(oid->inline_name, sizeof(oid->inline_name), fmt, ap); if (len >= sizeof(oid->inline_name)) return len; oid->name_len = len; return 0; } /* * If oid doesn't fit into inline buffer, BUG. */ void ceph_oid_printf(struct ceph_object_id *oid, const char *fmt, ...) { va_list ap; va_start(ap, fmt); BUG_ON(oid_printf_vargs(oid, fmt, ap)); va_end(ap); } EXPORT_SYMBOL(ceph_oid_printf); static __printf(3, 0) int oid_aprintf_vargs(struct ceph_object_id *oid, gfp_t gfp, const char *fmt, va_list ap) { va_list aq; int len; va_copy(aq, ap); len = oid_printf_vargs(oid, fmt, aq); va_end(aq); if (len) { char *external_name; external_name = kmalloc(len + 1, gfp); if (!external_name) return -ENOMEM; oid->name = external_name; WARN_ON(vsnprintf(oid->name, len + 1, fmt, ap) != len); oid->name_len = len; } return 0; } /* * If oid doesn't fit into inline buffer, allocate. */ int ceph_oid_aprintf(struct ceph_object_id *oid, gfp_t gfp, const char *fmt, ...) { va_list ap; int ret; va_start(ap, fmt); ret = oid_aprintf_vargs(oid, gfp, fmt, ap); va_end(ap); return ret; } EXPORT_SYMBOL(ceph_oid_aprintf); void ceph_oid_destroy(struct ceph_object_id *oid) { if (oid->name != oid->inline_name) kfree(oid->name); } EXPORT_SYMBOL(ceph_oid_destroy); /* * osds only */ static bool __osds_equal(const struct ceph_osds *lhs, const struct ceph_osds *rhs) { if (lhs->size == rhs->size && !memcmp(lhs->osds, rhs->osds, rhs->size * sizeof(rhs->osds[0]))) return true; return false; } /* * osds + primary */ static bool osds_equal(const struct ceph_osds *lhs, const struct ceph_osds *rhs) { if (__osds_equal(lhs, rhs) && lhs->primary == rhs->primary) return true; return false; } static bool osds_valid(const struct ceph_osds *set) { /* non-empty set */ if (set->size > 0 && set->primary >= 0) return true; /* empty can_shift_osds set */ if (!set->size && set->primary == -1) return true; /* empty !can_shift_osds set - all NONE */ if (set->size > 0 && set->primary == -1) { int i; for (i = 0; i < set->size; i++) { if (set->osds[i] != CRUSH_ITEM_NONE) break; } if (i == set->size) return true; } return false; } void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src) { memcpy(dest->osds, src->osds, src->size * sizeof(src->osds[0])); dest->size = src->size; dest->primary = src->primary; } bool ceph_pg_is_split(const struct ceph_pg *pgid, u32 old_pg_num, u32 new_pg_num) { int old_bits = calc_bits_of(old_pg_num); int old_mask = (1 << old_bits) - 1; int n; WARN_ON(pgid->seed >= old_pg_num); if (new_pg_num <= old_pg_num) return false; for (n = 1; ; n++) { int next_bit = n << (old_bits - 1); u32 s = next_bit | pgid->seed; if (s < old_pg_num || s == pgid->seed) continue; if (s >= new_pg_num) break; s = ceph_stable_mod(s, old_pg_num, old_mask); if (s == pgid->seed) return true; } return false; } bool ceph_is_new_interval(const struct ceph_osds *old_acting, const struct ceph_osds *new_acting, const struct ceph_osds *old_up, const struct ceph_osds *new_up, int old_size, int new_size, int old_min_size, int new_min_size, u32 old_pg_num, u32 new_pg_num, bool old_sort_bitwise, bool new_sort_bitwise, bool old_recovery_deletes, bool new_recovery_deletes, const struct ceph_pg *pgid) { return !osds_equal(old_acting, new_acting) || !osds_equal(old_up, new_up) || old_size != new_size || old_min_size != new_min_size || ceph_pg_is_split(pgid, old_pg_num, new_pg_num) || old_sort_bitwise != new_sort_bitwise || old_recovery_deletes != new_recovery_deletes; } static int calc_pg_rank(int osd, const struct ceph_osds *acting) { int i; for (i = 0; i < acting->size; i++) { if (acting->osds[i] == osd) return i; } return -1; } static bool primary_changed(const struct ceph_osds *old_acting, const struct ceph_osds *new_acting) { if (!old_acting->size && !new_acting->size) return false; /* both still empty */ if (!old_acting->size ^ !new_acting->size) return true; /* was empty, now not, or vice versa */ if (old_acting->primary != new_acting->primary) return true; /* primary changed */ if (calc_pg_rank(old_acting->primary, old_acting) != calc_pg_rank(new_acting->primary, new_acting)) return true; return false; /* same primary (tho replicas may have changed) */ } bool ceph_osds_changed(const struct ceph_osds *old_acting, const struct ceph_osds *new_acting, bool any_change) { if (primary_changed(old_acting, new_acting)) return true; if (any_change && !__osds_equal(old_acting, new_acting)) return true; return false; } /* * Map an object into a PG. * * Should only be called with target_oid and target_oloc (as opposed to * base_oid and base_oloc), since tiering isn't taken into account. */ void __ceph_object_locator_to_pg(struct ceph_pg_pool_info *pi, const struct ceph_object_id *oid, const struct ceph_object_locator *oloc, struct ceph_pg *raw_pgid) { WARN_ON(pi->id != oloc->pool); if (!oloc->pool_ns) { raw_pgid->pool = oloc->pool; raw_pgid->seed = ceph_str_hash(pi->object_hash, oid->name, oid->name_len); dout("%s %s -> raw_pgid %llu.%x\n", __func__, oid->name, raw_pgid->pool, raw_pgid->seed); } else { char stack_buf[256]; char *buf = stack_buf; int nsl = oloc->pool_ns->len; size_t total = nsl + 1 + oid->name_len; if (total > sizeof(stack_buf)) buf = kmalloc(total, GFP_NOIO | __GFP_NOFAIL); memcpy(buf, oloc->pool_ns->str, nsl); buf[nsl] = '\037'; memcpy(buf + nsl + 1, oid->name, oid->name_len); raw_pgid->pool = oloc->pool; raw_pgid->seed = ceph_str_hash(pi->object_hash, buf, total); if (buf != stack_buf) kfree(buf); dout("%s %s ns %.*s -> raw_pgid %llu.%x\n", __func__, oid->name, nsl, oloc->pool_ns->str, raw_pgid->pool, raw_pgid->seed); } } int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap, const struct ceph_object_id *oid, const struct ceph_object_locator *oloc, struct ceph_pg *raw_pgid) { struct ceph_pg_pool_info *pi; pi = ceph_pg_pool_by_id(osdmap, oloc->pool); if (!pi) return -ENOENT; __ceph_object_locator_to_pg(pi, oid, oloc, raw_pgid); return 0; } EXPORT_SYMBOL(ceph_object_locator_to_pg); /* * Map a raw PG (full precision ps) into an actual PG. */ static void raw_pg_to_pg(struct ceph_pg_pool_info *pi, const struct ceph_pg *raw_pgid, struct ceph_pg *pgid) { pgid->pool = raw_pgid->pool; pgid->seed = ceph_stable_mod(raw_pgid->seed, pi->pg_num, pi->pg_num_mask); } /* * Map a raw PG (full precision ps) into a placement ps (placement * seed). Include pool id in that value so that different pools don't * use the same seeds. */ static u32 raw_pg_to_pps(struct ceph_pg_pool_info *pi, const struct ceph_pg *raw_pgid) { if (pi->flags & CEPH_POOL_FLAG_HASHPSPOOL) { /* hash pool id and seed so that pool PGs do not overlap */ return crush_hash32_2(CRUSH_HASH_RJENKINS1, ceph_stable_mod(raw_pgid->seed, pi->pgp_num, pi->pgp_num_mask), raw_pgid->pool); } else { /* * legacy behavior: add ps and pool together. this is * not a great approach because the PGs from each pool * will overlap on top of each other: 0.5 == 1.4 == * 2.3 == ... */ return ceph_stable_mod(raw_pgid->seed, pi->pgp_num, pi->pgp_num_mask) + (unsigned)raw_pgid->pool; } } /* * Magic value used for a "default" fallback choose_args, used if the * crush_choose_arg_map passed to do_crush() does not exist. If this * also doesn't exist, fall back to canonical weights. */ #define CEPH_DEFAULT_CHOOSE_ARGS -1 static int do_crush(struct ceph_osdmap *map, int ruleno, int x, int *result, int result_max, const __u32 *weight, int weight_max, s64 choose_args_index) { struct crush_choose_arg_map *arg_map; struct crush_work *work; int r; BUG_ON(result_max > CEPH_PG_MAX_SIZE); arg_map = lookup_choose_arg_map(&map->crush->choose_args, choose_args_index); if (!arg_map) arg_map = lookup_choose_arg_map(&map->crush->choose_args, CEPH_DEFAULT_CHOOSE_ARGS); work = get_workspace(&map->crush_wsm, map->crush); r = crush_do_rule(map->crush, ruleno, x, result, result_max, weight, weight_max, work, arg_map ? arg_map->args : NULL); put_workspace(&map->crush_wsm, work); return r; } static void remove_nonexistent_osds(struct ceph_osdmap *osdmap, struct ceph_pg_pool_info *pi, struct ceph_osds *set) { int i; if (ceph_can_shift_osds(pi)) { int removed = 0; /* shift left */ for (i = 0; i < set->size; i++) { if (!ceph_osd_exists(osdmap, set->osds[i])) { removed++; continue; } if (removed) set->osds[i - removed] = set->osds[i]; } set->size -= removed; } else { /* set dne devices to NONE */ for (i = 0; i < set->size; i++) { if (!ceph_osd_exists(osdmap, set->osds[i])) set->osds[i] = CRUSH_ITEM_NONE; } } } /* * Calculate raw set (CRUSH output) for given PG and filter out * nonexistent OSDs. ->primary is undefined for a raw set. * * Placement seed (CRUSH input) is returned through @ppps. */ static void pg_to_raw_osds(struct ceph_osdmap *osdmap, struct ceph_pg_pool_info *pi, const struct ceph_pg *raw_pgid, struct ceph_osds *raw, u32 *ppps) { u32 pps = raw_pg_to_pps(pi, raw_pgid); int ruleno; int len; ceph_osds_init(raw); if (ppps) *ppps = pps; ruleno = crush_find_rule(osdmap->crush, pi->crush_ruleset, pi->type, pi->size); if (ruleno < 0) { pr_err("no crush rule: pool %lld ruleset %d type %d size %d\n", pi->id, pi->crush_ruleset, pi->type, pi->size); return; } if (pi->size > ARRAY_SIZE(raw->osds)) { pr_err_ratelimited("pool %lld ruleset %d type %d too wide: size %d > %zu\n", pi->id, pi->crush_ruleset, pi->type, pi->size, ARRAY_SIZE(raw->osds)); return; } len = do_crush(osdmap, ruleno, pps, raw->osds, pi->size, osdmap->osd_weight, osdmap->max_osd, pi->id); if (len < 0) { pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n", len, ruleno, pi->id, pi->crush_ruleset, pi->type, pi->size); return; } raw->size = len; remove_nonexistent_osds(osdmap, pi, raw); } /* apply pg_upmap[_items] mappings */ static void apply_upmap(struct ceph_osdmap *osdmap, const struct ceph_pg *pgid, struct ceph_osds *raw) { struct ceph_pg_mapping *pg; int i, j; pg = lookup_pg_mapping(&osdmap->pg_upmap, pgid); if (pg) { /* make sure targets aren't marked out */ for (i = 0; i < pg->pg_upmap.len; i++) { int osd = pg->pg_upmap.osds[i]; if (osd != CRUSH_ITEM_NONE && osd < osdmap->max_osd && osdmap->osd_weight[osd] == 0) { /* reject/ignore explicit mapping */ return; } } for (i = 0; i < pg->pg_upmap.len; i++) raw->osds[i] = pg->pg_upmap.osds[i]; raw->size = pg->pg_upmap.len; /* check and apply pg_upmap_items, if any */ } pg = lookup_pg_mapping(&osdmap->pg_upmap_items, pgid); if (pg) { /* * Note: this approach does not allow a bidirectional swap, * e.g., [[1,2],[2,1]] applied to [0,1,2] -> [0,2,1]. */ for (i = 0; i < pg->pg_upmap_items.len; i++) { int from = pg->pg_upmap_items.from_to[i][0]; int to = pg->pg_upmap_items.from_to[i][1]; int pos = -1; bool exists = false; /* make sure replacement doesn't already appear */ for (j = 0; j < raw->size; j++) { int osd = raw->osds[j]; if (osd == to) { exists = true; break; } /* ignore mapping if target is marked out */ if (osd == from && pos < 0 && !(to != CRUSH_ITEM_NONE && to < osdmap->max_osd && osdmap->osd_weight[to] == 0)) { pos = j; } } if (!exists && pos >= 0) raw->osds[pos] = to; } } } /* * Given raw set, calculate up set and up primary. By definition of an * up set, the result won't contain nonexistent or down OSDs. * * This is done in-place - on return @set is the up set. If it's * empty, ->primary will remain undefined. */ static void raw_to_up_osds(struct ceph_osdmap *osdmap, struct ceph_pg_pool_info *pi, struct ceph_osds *set) { int i; /* ->primary is undefined for a raw set */ BUG_ON(set->primary != -1); if (ceph_can_shift_osds(pi)) { int removed = 0; /* shift left */ for (i = 0; i < set->size; i++) { if (ceph_osd_is_down(osdmap, set->osds[i])) { removed++; continue; } if (removed) set->osds[i - removed] = set->osds[i]; } set->size -= removed; if (set->size > 0) set->primary = set->osds[0]; } else { /* set down/dne devices to NONE */ for (i = set->size - 1; i >= 0; i--) { if (ceph_osd_is_down(osdmap, set->osds[i])) set->osds[i] = CRUSH_ITEM_NONE; else set->primary = set->osds[i]; } } } static void apply_primary_affinity(struct ceph_osdmap *osdmap, struct ceph_pg_pool_info *pi, u32 pps, struct ceph_osds *up) { int i; int pos = -1; /* * Do we have any non-default primary_affinity values for these * osds? */ if (!osdmap->osd_primary_affinity) return; for (i = 0; i < up->size; i++) { int osd = up->osds[i]; if (osd != CRUSH_ITEM_NONE && osdmap->osd_primary_affinity[osd] != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) { break; } } if (i == up->size) return; /* * Pick the primary. Feed both the seed (for the pg) and the * osd into the hash/rng so that a proportional fraction of an * osd's pgs get rejected as primary. */ for (i = 0; i < up->size; i++) { int osd = up->osds[i]; u32 aff; if (osd == CRUSH_ITEM_NONE) continue; aff = osdmap->osd_primary_affinity[osd]; if (aff < CEPH_OSD_MAX_PRIMARY_AFFINITY && (crush_hash32_2(CRUSH_HASH_RJENKINS1, pps, osd) >> 16) >= aff) { /* * We chose not to use this primary. Note it * anyway as a fallback in case we don't pick * anyone else, but keep looking. */ if (pos < 0) pos = i; } else { pos = i; break; } } if (pos < 0) return; up->primary = up->osds[pos]; if (ceph_can_shift_osds(pi) && pos > 0) { /* move the new primary to the front */ for (i = pos; i > 0; i--) up->osds[i] = up->osds[i - 1]; up->osds[0] = up->primary; } } /* * Get pg_temp and primary_temp mappings for given PG. * * Note that a PG may have none, only pg_temp, only primary_temp or * both pg_temp and primary_temp mappings. This means @temp isn't * always a valid OSD set on return: in the "only primary_temp" case, * @temp will have its ->primary >= 0 but ->size == 0. */ static void get_temp_osds(struct ceph_osdmap *osdmap, struct ceph_pg_pool_info *pi, const struct ceph_pg *pgid, struct ceph_osds *temp) { struct ceph_pg_mapping *pg; int i; ceph_osds_init(temp); /* pg_temp? */ pg = lookup_pg_mapping(&osdmap->pg_temp, pgid); if (pg) { for (i = 0; i < pg->pg_temp.len; i++) { if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) { if (ceph_can_shift_osds(pi)) continue; temp->osds[temp->size++] = CRUSH_ITEM_NONE; } else { temp->osds[temp->size++] = pg->pg_temp.osds[i]; } } /* apply pg_temp's primary */ for (i = 0; i < temp->size; i++) { if (temp->osds[i] != CRUSH_ITEM_NONE) { temp->primary = temp->osds[i]; break; } } } /* primary_temp? */ pg = lookup_pg_mapping(&osdmap->primary_temp, pgid); if (pg) temp->primary = pg->primary_temp.osd; } /* * Map a PG to its acting set as well as its up set. * * Acting set is used for data mapping purposes, while up set can be * recorded for detecting interval changes and deciding whether to * resend a request. */ void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap, struct ceph_pg_pool_info *pi, const struct ceph_pg *raw_pgid, struct ceph_osds *up, struct ceph_osds *acting) { struct ceph_pg pgid; u32 pps; WARN_ON(pi->id != raw_pgid->pool); raw_pg_to_pg(pi, raw_pgid, &pgid); pg_to_raw_osds(osdmap, pi, raw_pgid, up, &pps); apply_upmap(osdmap, &pgid, up); raw_to_up_osds(osdmap, pi, up); apply_primary_affinity(osdmap, pi, pps, up); get_temp_osds(osdmap, pi, &pgid, acting); if (!acting->size) { memcpy(acting->osds, up->osds, up->size * sizeof(up->osds[0])); acting->size = up->size; if (acting->primary == -1) acting->primary = up->primary; } WARN_ON(!osds_valid(up) || !osds_valid(acting)); } bool ceph_pg_to_primary_shard(struct ceph_osdmap *osdmap, struct ceph_pg_pool_info *pi, const struct ceph_pg *raw_pgid, struct ceph_spg *spgid) { struct ceph_pg pgid; struct ceph_osds up, acting; int i; WARN_ON(pi->id != raw_pgid->pool); raw_pg_to_pg(pi, raw_pgid, &pgid); if (ceph_can_shift_osds(pi)) { spgid->pgid = pgid; /* struct */ spgid->shard = CEPH_SPG_NOSHARD; return true; } ceph_pg_to_up_acting_osds(osdmap, pi, &pgid, &up, &acting); for (i = 0; i < acting.size; i++) { if (acting.osds[i] == acting.primary) { spgid->pgid = pgid; /* struct */ spgid->shard = i; return true; } } return false; } /* * Return acting primary for given PG, or -1 if none. */ int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap, const struct ceph_pg *raw_pgid) { struct ceph_pg_pool_info *pi; struct ceph_osds up, acting; pi = ceph_pg_pool_by_id(osdmap, raw_pgid->pool); if (!pi) return -1; ceph_pg_to_up_acting_osds(osdmap, pi, raw_pgid, &up, &acting); return acting.primary; } EXPORT_SYMBOL(ceph_pg_to_acting_primary); static struct crush_loc_node *alloc_crush_loc(size_t type_name_len, size_t name_len) { struct crush_loc_node *loc; loc = kmalloc(sizeof(*loc) + type_name_len + name_len + 2, GFP_NOIO); if (!loc) return NULL; RB_CLEAR_NODE(&loc->cl_node); return loc; } static void free_crush_loc(struct crush_loc_node *loc) { WARN_ON(!RB_EMPTY_NODE(&loc->cl_node)); kfree(loc); } static int crush_loc_compare(const struct crush_loc *loc1, const struct crush_loc *loc2) { return strcmp(loc1->cl_type_name, loc2->cl_type_name) ?: strcmp(loc1->cl_name, loc2->cl_name); } DEFINE_RB_FUNCS2(crush_loc, struct crush_loc_node, cl_loc, crush_loc_compare, RB_BYPTR, const struct crush_loc *, cl_node) /* * Parses a set of <bucket type name>':'<bucket name> pairs separated * by '|', e.g. "rack:foo1|rack:foo2|datacenter:bar". * * Note that @crush_location is modified by strsep(). */ int ceph_parse_crush_location(char *crush_location, struct rb_root *locs) { struct crush_loc_node *loc; const char *type_name, *name, *colon; size_t type_name_len, name_len; dout("%s '%s'\n", __func__, crush_location); while ((type_name = strsep(&crush_location, "|"))) { colon = strchr(type_name, ':'); if (!colon) return -EINVAL; type_name_len = colon - type_name; if (type_name_len == 0) return -EINVAL; name = colon + 1; name_len = strlen(name); if (name_len == 0) return -EINVAL; loc = alloc_crush_loc(type_name_len, name_len); if (!loc) return -ENOMEM; loc->cl_loc.cl_type_name = loc->cl_data; memcpy(loc->cl_loc.cl_type_name, type_name, type_name_len); loc->cl_loc.cl_type_name[type_name_len] = '\0'; loc->cl_loc.cl_name = loc->cl_data + type_name_len + 1; memcpy(loc->cl_loc.cl_name, name, name_len); loc->cl_loc.cl_name[name_len] = '\0'; if (!__insert_crush_loc(locs, loc)) { free_crush_loc(loc); return -EEXIST; } dout("%s type_name '%s' name '%s'\n", __func__, loc->cl_loc.cl_type_name, loc->cl_loc.cl_name); } return 0; } int ceph_compare_crush_locs(struct rb_root *locs1, struct rb_root *locs2) { struct rb_node *n1 = rb_first(locs1); struct rb_node *n2 = rb_first(locs2); int ret; for ( ; n1 && n2; n1 = rb_next(n1), n2 = rb_next(n2)) { struct crush_loc_node *loc1 = rb_entry(n1, struct crush_loc_node, cl_node); struct crush_loc_node *loc2 = rb_entry(n2, struct crush_loc_node, cl_node); ret = crush_loc_compare(&loc1->cl_loc, &loc2->cl_loc); if (ret) return ret; } if (!n1 && n2) return -1; if (n1 && !n2) return 1; return 0; } void ceph_clear_crush_locs(struct rb_root *locs) { while (!RB_EMPTY_ROOT(locs)) { struct crush_loc_node *loc = rb_entry(rb_first(locs), struct crush_loc_node, cl_node); erase_crush_loc(locs, loc); free_crush_loc(loc); } } /* * [a-zA-Z0-9-_.]+ */ static bool is_valid_crush_name(const char *name) { do { if (!('a' <= *name && *name <= 'z') && !('A' <= *name && *name <= 'Z') && !('0' <= *name && *name <= '9') && *name != '-' && *name != '_' && *name != '.') return false; } while (*++name != '\0'); return true; } /* * Gets the parent of an item. Returns its id (<0 because the * parent is always a bucket), type id (>0 for the same reason, * via @parent_type_id) and location (via @parent_loc). If no * parent, returns 0. * * Does a linear search, as there are no parent pointers of any * kind. Note that the result is ambiguous for items that occur * multiple times in the map. */ static int get_immediate_parent(struct crush_map *c, int id, u16 *parent_type_id, struct crush_loc *parent_loc) { struct crush_bucket *b; struct crush_name_node *type_cn, *cn; int i, j; for (i = 0; i < c->max_buckets; i++) { b = c->buckets[i]; if (!b) continue; /* ignore per-class shadow hierarchy */ cn = lookup_crush_name(&c->names, b->id); if (!cn || !is_valid_crush_name(cn->cn_name)) continue; for (j = 0; j < b->size; j++) { if (b->items[j] != id) continue; *parent_type_id = b->type; type_cn = lookup_crush_name(&c->type_names, b->type); parent_loc->cl_type_name = type_cn->cn_name; parent_loc->cl_name = cn->cn_name; return b->id; } } return 0; /* no parent */ } /* * Calculates the locality/distance from an item to a client * location expressed in terms of CRUSH hierarchy as a set of * (bucket type name, bucket name) pairs. Specifically, looks * for the lowest-valued bucket type for which the location of * @id matches one of the locations in @locs, so for standard * bucket types (host = 1, rack = 3, datacenter = 8, zone = 9) * a matching host is closer than a matching rack and a matching * data center is closer than a matching zone. * * Specifying multiple locations (a "multipath" location) such * as "rack=foo1 rack=foo2 datacenter=bar" is allowed -- @locs * is a multimap. The locality will be: * * - 3 for OSDs in racks foo1 and foo2 * - 8 for OSDs in data center bar * - -1 for all other OSDs * * The lowest possible bucket type is 1, so the best locality * for an OSD is 1 (i.e. a matching host). Locality 0 would be * the OSD itself. */ int ceph_get_crush_locality(struct ceph_osdmap *osdmap, int id, struct rb_root *locs) { struct crush_loc loc; u16 type_id; /* * Instead of repeated get_immediate_parent() calls, * the location of @id could be obtained with a single * depth-first traversal. */ for (;;) { id = get_immediate_parent(osdmap->crush, id, &type_id, &loc); if (id >= 0) return -1; /* not local */ if (lookup_crush_loc(locs, &loc)) return type_id; } }
13 162 2976 83 3014 3016 399 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef RQ_QOS_H #define RQ_QOS_H #include <linux/kernel.h> #include <linux/blkdev.h> #include <linux/blk_types.h> #include <linux/atomic.h> #include <linux/wait.h> #include <linux/blk-mq.h> #include "blk-mq-debugfs.h" struct blk_mq_debugfs_attr; enum rq_qos_id { RQ_QOS_WBT, RQ_QOS_LATENCY, RQ_QOS_COST, }; struct rq_wait { wait_queue_head_t wait; atomic_t inflight; }; struct rq_qos { const struct rq_qos_ops *ops; struct gendisk *disk; enum rq_qos_id id; struct rq_qos *next; #ifdef CONFIG_BLK_DEBUG_FS struct dentry *debugfs_dir; #endif }; struct rq_qos_ops { void (*throttle)(struct rq_qos *, struct bio *); void (*track)(struct rq_qos *, struct request *, struct bio *); void (*merge)(struct rq_qos *, struct request *, struct bio *); void (*issue)(struct rq_qos *, struct request *); void (*requeue)(struct rq_qos *, struct request *); void (*done)(struct rq_qos *, struct request *); void (*done_bio)(struct rq_qos *, struct bio *); void (*cleanup)(struct rq_qos *, struct bio *); void (*queue_depth_changed)(struct rq_qos *); void (*exit)(struct rq_qos *); const struct blk_mq_debugfs_attr *debugfs_attrs; }; struct rq_depth { unsigned int max_depth; int scale_step; bool scaled_max; unsigned int queue_depth; unsigned int default_depth; }; static inline struct rq_qos *rq_qos_id(struct request_queue *q, enum rq_qos_id id) { struct rq_qos *rqos; for (rqos = q->rq_qos; rqos; rqos = rqos->next) { if (rqos->id == id) break; } return rqos; } static inline struct rq_qos *wbt_rq_qos(struct request_queue *q) { return rq_qos_id(q, RQ_QOS_WBT); } static inline struct rq_qos *iolat_rq_qos(struct request_queue *q) { return rq_qos_id(q, RQ_QOS_LATENCY); } static inline void rq_wait_init(struct rq_wait *rq_wait) { atomic_set(&rq_wait->inflight, 0); init_waitqueue_head(&rq_wait->wait); } int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id, const struct rq_qos_ops *ops); void rq_qos_del(struct rq_qos *rqos); typedef bool (acquire_inflight_cb_t)(struct rq_wait *rqw, void *private_data); typedef void (cleanup_cb_t)(struct rq_wait *rqw, void *private_data); void rq_qos_wait(struct rq_wait *rqw, void *private_data, acquire_inflight_cb_t *acquire_inflight_cb, cleanup_cb_t *cleanup_cb); bool rq_wait_inc_below(struct rq_wait *rq_wait, unsigned int limit); bool rq_depth_scale_up(struct rq_depth *rqd); bool rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle); bool rq_depth_calc_max_depth(struct rq_depth *rqd); void __rq_qos_cleanup(struct rq_qos *rqos, struct bio *bio); void __rq_qos_done(struct rq_qos *rqos, struct request *rq); void __rq_qos_issue(struct rq_qos *rqos, struct request *rq); void __rq_qos_requeue(struct rq_qos *rqos, struct request *rq); void __rq_qos_throttle(struct rq_qos *rqos, struct bio *bio); void __rq_qos_track(struct rq_qos *rqos, struct request *rq, struct bio *bio); void __rq_qos_merge(struct rq_qos *rqos, struct request *rq, struct bio *bio); void __rq_qos_done_bio(struct rq_qos *rqos, struct bio *bio); void __rq_qos_queue_depth_changed(struct rq_qos *rqos); static inline void rq_qos_cleanup(struct request_queue *q, struct bio *bio) { if (q->rq_qos) __rq_qos_cleanup(q->rq_qos, bio); } static inline void rq_qos_done(struct request_queue *q, struct request *rq) { if (q->rq_qos && !blk_rq_is_passthrough(rq)) __rq_qos_done(q->rq_qos, rq); } static inline void rq_qos_issue(struct request_queue *q, struct request *rq) { if (q->rq_qos) __rq_qos_issue(q->rq_qos, rq); } static inline void rq_qos_requeue(struct request_queue *q, struct request *rq) { if (q->rq_qos) __rq_qos_requeue(q->rq_qos, rq); } static inline void rq_qos_done_bio(struct bio *bio) { if (bio->bi_bdev && (bio_flagged(bio, BIO_QOS_THROTTLED) || bio_flagged(bio, BIO_QOS_MERGED))) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); if (q->rq_qos) __rq_qos_done_bio(q->rq_qos, bio); } } static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio) { if (q->rq_qos) { bio_set_flag(bio, BIO_QOS_THROTTLED); __rq_qos_throttle(q->rq_qos, bio); } } static inline void rq_qos_track(struct request_queue *q, struct request *rq, struct bio *bio) { if (q->rq_qos) __rq_qos_track(q->rq_qos, rq, bio); } static inline void rq_qos_merge(struct request_queue *q, struct request *rq, struct bio *bio) { if (q->rq_qos) { bio_set_flag(bio, BIO_QOS_MERGED); __rq_qos_merge(q->rq_qos, rq, bio); } } static inline void rq_qos_queue_depth_changed(struct request_queue *q) { if (q->rq_qos) __rq_qos_queue_depth_changed(q->rq_qos); } void rq_qos_exit(struct request_queue *); #endif
171 890 858 5 2 1 3 2 8 2 4 3 880 671 671 245 21 14 242 2 7 223 1 224 179 179 122 122 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2011 IBM Corporation * * Author: * Mimi Zohar <zohar@us.ibm.com> */ #include <linux/module.h> #include <linux/init.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/xattr.h> #include <linux/magic.h> #include <linux/ima.h> #include <linux/evm.h> #include <linux/fsverity.h> #include <keys/system_keyring.h> #include <uapi/linux/fsverity.h> #include "ima.h" #ifdef CONFIG_IMA_APPRAISE_BOOTPARAM static char *ima_appraise_cmdline_default __initdata; core_param(ima_appraise, ima_appraise_cmdline_default, charp, 0); void __init ima_appraise_parse_cmdline(void) { const char *str = ima_appraise_cmdline_default; bool sb_state = arch_ima_get_secureboot(); int appraisal_state = ima_appraise; if (!str) return; if (strncmp(str, "off", 3) == 0) appraisal_state = 0; else if (strncmp(str, "log", 3) == 0) appraisal_state = IMA_APPRAISE_LOG; else if (strncmp(str, "fix", 3) == 0) appraisal_state = IMA_APPRAISE_FIX; else if (strncmp(str, "enforce", 7) == 0) appraisal_state = IMA_APPRAISE_ENFORCE; else pr_err("invalid \"%s\" appraise option", str); /* If appraisal state was changed, but secure boot is enabled, * keep its default */ if (sb_state) { if (!(appraisal_state & IMA_APPRAISE_ENFORCE)) pr_info("Secure boot enabled: ignoring ima_appraise=%s option", str); } else { ima_appraise = appraisal_state; } } #endif /* * is_ima_appraise_enabled - return appraise status * * Only return enabled, if not in ima_appraise="fix" or "log" modes. */ bool is_ima_appraise_enabled(void) { return ima_appraise & IMA_APPRAISE_ENFORCE; } /* * ima_must_appraise - set appraise flag * * Return 1 to appraise or hash */ int ima_must_appraise(struct mnt_idmap *idmap, struct inode *inode, int mask, enum ima_hooks func) { u32 secid; if (!ima_appraise) return 0; security_current_getsecid_subj(&secid); return ima_match_policy(idmap, inode, current_cred(), secid, func, mask, IMA_APPRAISE | IMA_HASH, NULL, NULL, NULL, NULL); } static int ima_fix_xattr(struct dentry *dentry, struct integrity_iint_cache *iint) { int rc, offset; u8 algo = iint->ima_hash->algo; if (algo <= HASH_ALGO_SHA1) { offset = 1; iint->ima_hash->xattr.sha1.type = IMA_XATTR_DIGEST; } else { offset = 0; iint->ima_hash->xattr.ng.type = IMA_XATTR_DIGEST_NG; iint->ima_hash->xattr.ng.algo = algo; } rc = __vfs_setxattr_noperm(&nop_mnt_idmap, dentry, XATTR_NAME_IMA, &iint->ima_hash->xattr.data[offset], (sizeof(iint->ima_hash->xattr) - offset) + iint->ima_hash->length, 0); return rc; } /* Return specific func appraised cached result */ enum integrity_status ima_get_cache_status(struct integrity_iint_cache *iint, enum ima_hooks func) { switch (func) { case MMAP_CHECK: case MMAP_CHECK_REQPROT: return iint->ima_mmap_status; case BPRM_CHECK: return iint->ima_bprm_status; case CREDS_CHECK: return iint->ima_creds_status; case FILE_CHECK: case POST_SETATTR: return iint->ima_file_status; case MODULE_CHECK ... MAX_CHECK - 1: default: return iint->ima_read_status; } } static void ima_set_cache_status(struct integrity_iint_cache *iint, enum ima_hooks func, enum integrity_status status) { switch (func) { case MMAP_CHECK: case MMAP_CHECK_REQPROT: iint->ima_mmap_status = status; break; case BPRM_CHECK: iint->ima_bprm_status = status; break; case CREDS_CHECK: iint->ima_creds_status = status; break; case FILE_CHECK: case POST_SETATTR: iint->ima_file_status = status; break; case MODULE_CHECK ... MAX_CHECK - 1: default: iint->ima_read_status = status; break; } } static void ima_cache_flags(struct integrity_iint_cache *iint, enum ima_hooks func) { switch (func) { case MMAP_CHECK: case MMAP_CHECK_REQPROT: iint->flags |= (IMA_MMAP_APPRAISED | IMA_APPRAISED); break; case BPRM_CHECK: iint->flags |= (IMA_BPRM_APPRAISED | IMA_APPRAISED); break; case CREDS_CHECK: iint->flags |= (IMA_CREDS_APPRAISED | IMA_APPRAISED); break; case FILE_CHECK: case POST_SETATTR: iint->flags |= (IMA_FILE_APPRAISED | IMA_APPRAISED); break; case MODULE_CHECK ... MAX_CHECK - 1: default: iint->flags |= (IMA_READ_APPRAISED | IMA_APPRAISED); break; } } enum hash_algo ima_get_hash_algo(const struct evm_ima_xattr_data *xattr_value, int xattr_len) { struct signature_v2_hdr *sig; enum hash_algo ret; if (!xattr_value || xattr_len < 2) /* return default hash algo */ return ima_hash_algo; switch (xattr_value->type) { case IMA_VERITY_DIGSIG: sig = (typeof(sig))xattr_value; if (sig->version != 3 || xattr_len <= sizeof(*sig) || sig->hash_algo >= HASH_ALGO__LAST) return ima_hash_algo; return sig->hash_algo; case EVM_IMA_XATTR_DIGSIG: sig = (typeof(sig))xattr_value; if (sig->version != 2 || xattr_len <= sizeof(*sig) || sig->hash_algo >= HASH_ALGO__LAST) return ima_hash_algo; return sig->hash_algo; case IMA_XATTR_DIGEST_NG: /* first byte contains algorithm id */ ret = xattr_value->data[0]; if (ret < HASH_ALGO__LAST) return ret; break; case IMA_XATTR_DIGEST: /* this is for backward compatibility */ if (xattr_len == 21) { unsigned int zero = 0; if (!memcmp(&xattr_value->data[16], &zero, 4)) return HASH_ALGO_MD5; else return HASH_ALGO_SHA1; } else if (xattr_len == 17) return HASH_ALGO_MD5; break; } /* return default hash algo */ return ima_hash_algo; } int ima_read_xattr(struct dentry *dentry, struct evm_ima_xattr_data **xattr_value, int xattr_len) { int ret; ret = vfs_getxattr_alloc(&nop_mnt_idmap, dentry, XATTR_NAME_IMA, (char **)xattr_value, xattr_len, GFP_NOFS); if (ret == -EOPNOTSUPP) ret = 0; return ret; } /* * calc_file_id_hash - calculate the hash of the ima_file_id struct data * @type: xattr type [enum evm_ima_xattr_type] * @algo: hash algorithm [enum hash_algo] * @digest: pointer to the digest to be hashed * @hash: (out) pointer to the hash * * IMA signature version 3 disambiguates the data that is signed by * indirectly signing the hash of the ima_file_id structure data. * * Signing the ima_file_id struct is currently only supported for * IMA_VERITY_DIGSIG type xattrs. * * Return 0 on success, error code otherwise. */ static int calc_file_id_hash(enum evm_ima_xattr_type type, enum hash_algo algo, const u8 *digest, struct ima_digest_data *hash) { struct ima_file_id file_id = { .hash_type = IMA_VERITY_DIGSIG, .hash_algorithm = algo}; unsigned int unused = HASH_MAX_DIGESTSIZE - hash_digest_size[algo]; if (type != IMA_VERITY_DIGSIG) return -EINVAL; memcpy(file_id.hash, digest, hash_digest_size[algo]); hash->algo = algo; hash->length = hash_digest_size[algo]; return ima_calc_buffer_hash(&file_id, sizeof(file_id) - unused, hash); } /* * xattr_verify - verify xattr digest or signature * * Verify whether the hash or signature matches the file contents. * * Return 0 on success, error code otherwise. */ static int xattr_verify(enum ima_hooks func, struct integrity_iint_cache *iint, struct evm_ima_xattr_data *xattr_value, int xattr_len, enum integrity_status *status, const char **cause) { struct ima_max_digest_data hash; struct signature_v2_hdr *sig; int rc = -EINVAL, hash_start = 0; int mask; switch (xattr_value->type) { case IMA_XATTR_DIGEST_NG: /* first byte contains algorithm id */ hash_start = 1; fallthrough; case IMA_XATTR_DIGEST: if (*status != INTEGRITY_PASS_IMMUTABLE) { if (iint->flags & IMA_DIGSIG_REQUIRED) { if (iint->flags & IMA_VERITY_REQUIRED) *cause = "verity-signature-required"; else *cause = "IMA-signature-required"; *status = INTEGRITY_FAIL; break; } clear_bit(IMA_DIGSIG, &iint->atomic_flags); } else { set_bit(IMA_DIGSIG, &iint->atomic_flags); } if (xattr_len - sizeof(xattr_value->type) - hash_start >= iint->ima_hash->length) /* * xattr length may be longer. md5 hash in previous * version occupied 20 bytes in xattr, instead of 16 */ rc = memcmp(&xattr_value->data[hash_start], iint->ima_hash->digest, iint->ima_hash->length); else rc = -EINVAL; if (rc) { *cause = "invalid-hash"; *status = INTEGRITY_FAIL; break; } *status = INTEGRITY_PASS; break; case EVM_IMA_XATTR_DIGSIG: set_bit(IMA_DIGSIG, &iint->atomic_flags); mask = IMA_DIGSIG_REQUIRED | IMA_VERITY_REQUIRED; if ((iint->flags & mask) == mask) { *cause = "verity-signature-required"; *status = INTEGRITY_FAIL; break; } sig = (typeof(sig))xattr_value; if (sig->version >= 3) { *cause = "invalid-signature-version"; *status = INTEGRITY_FAIL; break; } rc = integrity_digsig_verify(INTEGRITY_KEYRING_IMA, (const char *)xattr_value, xattr_len, iint->ima_hash->digest, iint->ima_hash->length); if (rc == -EOPNOTSUPP) { *status = INTEGRITY_UNKNOWN; break; } if (IS_ENABLED(CONFIG_INTEGRITY_PLATFORM_KEYRING) && rc && func == KEXEC_KERNEL_CHECK) rc = integrity_digsig_verify(INTEGRITY_KEYRING_PLATFORM, (const char *)xattr_value, xattr_len, iint->ima_hash->digest, iint->ima_hash->length); if (rc) { *cause = "invalid-signature"; *status = INTEGRITY_FAIL; } else { *status = INTEGRITY_PASS; } break; case IMA_VERITY_DIGSIG: set_bit(IMA_DIGSIG, &iint->atomic_flags); if (iint->flags & IMA_DIGSIG_REQUIRED) { if (!(iint->flags & IMA_VERITY_REQUIRED)) { *cause = "IMA-signature-required"; *status = INTEGRITY_FAIL; break; } } sig = (typeof(sig))xattr_value; if (sig->version != 3) { *cause = "invalid-signature-version"; *status = INTEGRITY_FAIL; break; } rc = calc_file_id_hash(IMA_VERITY_DIGSIG, iint->ima_hash->algo, iint->ima_hash->digest, &hash.hdr); if (rc) { *cause = "sigv3-hashing-error"; *status = INTEGRITY_FAIL; break; } rc = integrity_digsig_verify(INTEGRITY_KEYRING_IMA, (const char *)xattr_value, xattr_len, hash.digest, hash.hdr.length); if (rc) { *cause = "invalid-verity-signature"; *status = INTEGRITY_FAIL; } else { *status = INTEGRITY_PASS; } break; default: *status = INTEGRITY_UNKNOWN; *cause = "unknown-ima-data"; break; } return rc; } /* * modsig_verify - verify modsig signature * * Verify whether the signature matches the file contents. * * Return 0 on success, error code otherwise. */ static int modsig_verify(enum ima_hooks func, const struct modsig *modsig, enum integrity_status *status, const char **cause) { int rc; rc = integrity_modsig_verify(INTEGRITY_KEYRING_IMA, modsig); if (IS_ENABLED(CONFIG_INTEGRITY_PLATFORM_KEYRING) && rc && func == KEXEC_KERNEL_CHECK) rc = integrity_modsig_verify(INTEGRITY_KEYRING_PLATFORM, modsig); if (rc) { *cause = "invalid-signature"; *status = INTEGRITY_FAIL; } else { *status = INTEGRITY_PASS; } return rc; } /* * ima_check_blacklist - determine if the binary is blacklisted. * * Add the hash of the blacklisted binary to the measurement list, based * on policy. * * Returns -EPERM if the hash is blacklisted. */ int ima_check_blacklist(struct integrity_iint_cache *iint, const struct modsig *modsig, int pcr) { enum hash_algo hash_algo; const u8 *digest = NULL; u32 digestsize = 0; int rc = 0; if (!(iint->flags & IMA_CHECK_BLACKLIST)) return 0; if (iint->flags & IMA_MODSIG_ALLOWED && modsig) { ima_get_modsig_digest(modsig, &hash_algo, &digest, &digestsize); rc = is_binary_blacklisted(digest, digestsize); } else if (iint->flags & IMA_DIGSIG_REQUIRED && iint->ima_hash) rc = is_binary_blacklisted(iint->ima_hash->digest, iint->ima_hash->length); if ((rc == -EPERM) && (iint->flags & IMA_MEASURE)) process_buffer_measurement(&nop_mnt_idmap, NULL, digest, digestsize, "blacklisted-hash", NONE, pcr, NULL, false, NULL, 0); return rc; } /* * ima_appraise_measurement - appraise file measurement * * Call evm_verifyxattr() to verify the integrity of 'security.ima'. * Assuming success, compare the xattr hash with the collected measurement. * * Return 0 on success, error code otherwise */ int ima_appraise_measurement(enum ima_hooks func, struct integrity_iint_cache *iint, struct file *file, const unsigned char *filename, struct evm_ima_xattr_data *xattr_value, int xattr_len, const struct modsig *modsig) { static const char op[] = "appraise_data"; const char *cause = "unknown"; struct dentry *dentry = file_dentry(file); struct inode *inode = d_backing_inode(dentry); enum integrity_status status = INTEGRITY_UNKNOWN; int rc = xattr_len; bool try_modsig = iint->flags & IMA_MODSIG_ALLOWED && modsig; /* If not appraising a modsig, we need an xattr. */ if (!(inode->i_opflags & IOP_XATTR) && !try_modsig) return INTEGRITY_UNKNOWN; /* If reading the xattr failed and there's no modsig, error out. */ if (rc <= 0 && !try_modsig) { if (rc && rc != -ENODATA) goto out; if (iint->flags & IMA_DIGSIG_REQUIRED) { if (iint->flags & IMA_VERITY_REQUIRED) cause = "verity-signature-required"; else cause = "IMA-signature-required"; } else { cause = "missing-hash"; } status = INTEGRITY_NOLABEL; if (file->f_mode & FMODE_CREATED) iint->flags |= IMA_NEW_FILE; if ((iint->flags & IMA_NEW_FILE) && (!(iint->flags & IMA_DIGSIG_REQUIRED) || (inode->i_size == 0))) status = INTEGRITY_PASS; goto out; } status = evm_verifyxattr(dentry, XATTR_NAME_IMA, xattr_value, rc < 0 ? 0 : rc, iint); switch (status) { case INTEGRITY_PASS: case INTEGRITY_PASS_IMMUTABLE: case INTEGRITY_UNKNOWN: break; case INTEGRITY_NOXATTRS: /* No EVM protected xattrs. */ /* It's fine not to have xattrs when using a modsig. */ if (try_modsig) break; fallthrough; case INTEGRITY_NOLABEL: /* No security.evm xattr. */ cause = "missing-HMAC"; goto out; case INTEGRITY_FAIL_IMMUTABLE: set_bit(IMA_DIGSIG, &iint->atomic_flags); cause = "invalid-fail-immutable"; goto out; case INTEGRITY_FAIL: /* Invalid HMAC/signature. */ cause = "invalid-HMAC"; goto out; default: WARN_ONCE(true, "Unexpected integrity status %d\n", status); } if (xattr_value) rc = xattr_verify(func, iint, xattr_value, xattr_len, &status, &cause); /* * If we have a modsig and either no imasig or the imasig's key isn't * known, then try verifying the modsig. */ if (try_modsig && (!xattr_value || xattr_value->type == IMA_XATTR_DIGEST_NG || rc == -ENOKEY)) rc = modsig_verify(func, modsig, &status, &cause); out: /* * File signatures on some filesystems can not be properly verified. * When such filesystems are mounted by an untrusted mounter or on a * system not willing to accept such a risk, fail the file signature * verification. */ if ((inode->i_sb->s_iflags & SB_I_IMA_UNVERIFIABLE_SIGNATURE) && ((inode->i_sb->s_iflags & SB_I_UNTRUSTED_MOUNTER) || (iint->flags & IMA_FAIL_UNVERIFIABLE_SIGS))) { status = INTEGRITY_FAIL; cause = "unverifiable-signature"; integrity_audit_msg(AUDIT_INTEGRITY_DATA, inode, filename, op, cause, rc, 0); } else if (status != INTEGRITY_PASS) { /* Fix mode, but don't replace file signatures. */ if ((ima_appraise & IMA_APPRAISE_FIX) && !try_modsig && (!xattr_value || xattr_value->type != EVM_IMA_XATTR_DIGSIG)) { if (!ima_fix_xattr(dentry, iint)) status = INTEGRITY_PASS; } /* * Permit new files with file/EVM portable signatures, but * without data. */ if (inode->i_size == 0 && iint->flags & IMA_NEW_FILE && test_bit(IMA_DIGSIG, &iint->atomic_flags)) { status = INTEGRITY_PASS; } integrity_audit_msg(AUDIT_INTEGRITY_DATA, inode, filename, op, cause, rc, 0); } else { ima_cache_flags(iint, func); } ima_set_cache_status(iint, func, status); return status; } /* * ima_update_xattr - update 'security.ima' hash value */ void ima_update_xattr(struct integrity_iint_cache *iint, struct file *file) { struct dentry *dentry = file_dentry(file); int rc = 0; /* do not collect and update hash for digital signatures */ if (test_bit(IMA_DIGSIG, &iint->atomic_flags)) return; if ((iint->ima_file_status != INTEGRITY_PASS) && !(iint->flags & IMA_HASH)) return; rc = ima_collect_measurement(iint, file, NULL, 0, ima_hash_algo, NULL); if (rc < 0) return; inode_lock(file_inode(file)); ima_fix_xattr(dentry, iint); inode_unlock(file_inode(file)); } /** * ima_inode_post_setattr - reflect file metadata changes * @idmap: idmap of the mount the inode was found from * @dentry: pointer to the affected dentry * * Changes to a dentry's metadata might result in needing to appraise. * * This function is called from notify_change(), which expects the caller * to lock the inode's i_mutex. */ void ima_inode_post_setattr(struct mnt_idmap *idmap, struct dentry *dentry) { struct inode *inode = d_backing_inode(dentry); struct integrity_iint_cache *iint; int action; if (!(ima_policy_flag & IMA_APPRAISE) || !S_ISREG(inode->i_mode) || !(inode->i_opflags & IOP_XATTR)) return; action = ima_must_appraise(idmap, inode, MAY_ACCESS, POST_SETATTR); iint = integrity_iint_find(inode); if (iint) { set_bit(IMA_CHANGE_ATTR, &iint->atomic_flags); if (!action) clear_bit(IMA_UPDATE_XATTR, &iint->atomic_flags); } } /* * ima_protect_xattr - protect 'security.ima' * * Ensure that not just anyone can modify or remove 'security.ima'. */ static int ima_protect_xattr(struct dentry *dentry, const char *xattr_name, const void *xattr_value, size_t xattr_value_len) { if (strcmp(xattr_name, XATTR_NAME_IMA) == 0) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; return 1; } return 0; } static void ima_reset_appraise_flags(struct inode *inode, int digsig) { struct integrity_iint_cache *iint; if (!(ima_policy_flag & IMA_APPRAISE) || !S_ISREG(inode->i_mode)) return; iint = integrity_iint_find(inode); if (!iint) return; iint->measured_pcrs = 0; set_bit(IMA_CHANGE_XATTR, &iint->atomic_flags); if (digsig) set_bit(IMA_DIGSIG, &iint->atomic_flags); else clear_bit(IMA_DIGSIG, &iint->atomic_flags); } /** * validate_hash_algo() - Block setxattr with unsupported hash algorithms * @dentry: object of the setxattr() * @xattr_value: userland supplied xattr value * @xattr_value_len: length of xattr_value * * The xattr value is mapped to its hash algorithm, and this algorithm * must be built in the kernel for the setxattr to be allowed. * * Emit an audit message when the algorithm is invalid. * * Return: 0 on success, else an error. */ static int validate_hash_algo(struct dentry *dentry, const struct evm_ima_xattr_data *xattr_value, size_t xattr_value_len) { char *path = NULL, *pathbuf = NULL; enum hash_algo xattr_hash_algo; const char *errmsg = "unavailable-hash-algorithm"; unsigned int allowed_hashes; xattr_hash_algo = ima_get_hash_algo(xattr_value, xattr_value_len); allowed_hashes = atomic_read(&ima_setxattr_allowed_hash_algorithms); if (allowed_hashes) { /* success if the algorithm is allowed in the ima policy */ if (allowed_hashes & (1U << xattr_hash_algo)) return 0; /* * We use a different audit message when the hash algorithm * is denied by a policy rule, instead of not being built * in the kernel image */ errmsg = "denied-hash-algorithm"; } else { if (likely(xattr_hash_algo == ima_hash_algo)) return 0; /* allow any xattr using an algorithm built in the kernel */ if (crypto_has_alg(hash_algo_name[xattr_hash_algo], 0, 0)) return 0; } pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); if (!pathbuf) return -EACCES; path = dentry_path(dentry, pathbuf, PATH_MAX); integrity_audit_msg(AUDIT_INTEGRITY_DATA, d_inode(dentry), path, "set_data", errmsg, -EACCES, 0); kfree(pathbuf); return -EACCES; } int ima_inode_setxattr(struct dentry *dentry, const char *xattr_name, const void *xattr_value, size_t xattr_value_len) { const struct evm_ima_xattr_data *xvalue = xattr_value; int digsig = 0; int result; int err; result = ima_protect_xattr(dentry, xattr_name, xattr_value, xattr_value_len); if (result == 1) { if (!xattr_value_len || (xvalue->type >= IMA_XATTR_LAST)) return -EINVAL; err = validate_hash_algo(dentry, xvalue, xattr_value_len); if (err) return err; digsig = (xvalue->type == EVM_IMA_XATTR_DIGSIG); } else if (!strcmp(xattr_name, XATTR_NAME_EVM) && xattr_value_len > 0) { digsig = (xvalue->type == EVM_XATTR_PORTABLE_DIGSIG); } if (result == 1 || evm_revalidate_status(xattr_name)) { ima_reset_appraise_flags(d_backing_inode(dentry), digsig); if (result == 1) result = 0; } return result; } int ima_inode_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl) { if (evm_revalidate_status(acl_name)) ima_reset_appraise_flags(d_backing_inode(dentry), 0); return 0; } int ima_inode_removexattr(struct dentry *dentry, const char *xattr_name) { int result; result = ima_protect_xattr(dentry, xattr_name, NULL, 0); if (result == 1 || evm_revalidate_status(xattr_name)) { ima_reset_appraise_flags(d_backing_inode(dentry), 0); if (result == 1) result = 0; } return result; }
99 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 // SPDX-License-Identifier: MIT #include <linux/moduleparam.h> #include <linux/vmalloc.h> #include <drm/drm_crtc_helper.h> #include <drm/drm_drv.h> #include <drm/drm_fb_helper.h> #include <drm/drm_framebuffer.h> #include <drm/drm_gem.h> #include <drm/drm_print.h> #include <drm/drm_fbdev_generic.h> /* @user: 1=userspace, 0=fbcon */ static int drm_fbdev_generic_fb_open(struct fb_info *info, int user) { struct drm_fb_helper *fb_helper = info->par; /* No need to take a ref for fbcon because it unbinds on unregister */ if (user && !try_module_get(fb_helper->dev->driver->fops->owner)) return -ENODEV; return 0; } static int drm_fbdev_generic_fb_release(struct fb_info *info, int user) { struct drm_fb_helper *fb_helper = info->par; if (user) module_put(fb_helper->dev->driver->fops->owner); return 0; } FB_GEN_DEFAULT_DEFERRED_SYSMEM_OPS(drm_fbdev_generic, drm_fb_helper_damage_range, drm_fb_helper_damage_area); static void drm_fbdev_generic_fb_destroy(struct fb_info *info) { struct drm_fb_helper *fb_helper = info->par; void *shadow = info->screen_buffer; if (!fb_helper->dev) return; fb_deferred_io_cleanup(info); drm_fb_helper_fini(fb_helper); vfree(shadow); drm_client_framebuffer_delete(fb_helper->buffer); drm_client_release(&fb_helper->client); drm_fb_helper_unprepare(fb_helper); kfree(fb_helper); } static const struct fb_ops drm_fbdev_generic_fb_ops = { .owner = THIS_MODULE, .fb_open = drm_fbdev_generic_fb_open, .fb_release = drm_fbdev_generic_fb_release, FB_DEFAULT_DEFERRED_OPS(drm_fbdev_generic), DRM_FB_HELPER_DEFAULT_OPS, .fb_destroy = drm_fbdev_generic_fb_destroy, }; /* * This function uses the client API to create a framebuffer backed by a dumb buffer. */ static int drm_fbdev_generic_helper_fb_probe(struct drm_fb_helper *fb_helper, struct drm_fb_helper_surface_size *sizes) { struct drm_client_dev *client = &fb_helper->client; struct drm_device *dev = fb_helper->dev; struct drm_client_buffer *buffer; struct fb_info *info; size_t screen_size; void *screen_buffer; u32 format; int ret; drm_dbg_kms(dev, "surface width(%d), height(%d) and bpp(%d)\n", sizes->surface_width, sizes->surface_height, sizes->surface_bpp); format = drm_mode_legacy_fb_format(sizes->surface_bpp, sizes->surface_depth); buffer = drm_client_framebuffer_create(client, sizes->surface_width, sizes->surface_height, format); if (IS_ERR(buffer)) return PTR_ERR(buffer); fb_helper->buffer = buffer; fb_helper->fb = buffer->fb; screen_size = buffer->gem->size; screen_buffer = vzalloc(screen_size); if (!screen_buffer) { ret = -ENOMEM; goto err_drm_client_framebuffer_delete; } info = drm_fb_helper_alloc_info(fb_helper); if (IS_ERR(info)) { ret = PTR_ERR(info); goto err_vfree; } drm_fb_helper_fill_info(info, fb_helper, sizes); info->fbops = &drm_fbdev_generic_fb_ops; /* screen */ info->flags |= FBINFO_VIRTFB | FBINFO_READS_FAST; info->screen_buffer = screen_buffer; info->fix.smem_start = page_to_phys(vmalloc_to_page(info->screen_buffer)); info->fix.smem_len = screen_size; /* deferred I/O */ fb_helper->fbdefio.delay = HZ / 20; fb_helper->fbdefio.deferred_io = drm_fb_helper_deferred_io; info->fbdefio = &fb_helper->fbdefio; ret = fb_deferred_io_init(info); if (ret) goto err_drm_fb_helper_release_info; return 0; err_drm_fb_helper_release_info: drm_fb_helper_release_info(fb_helper); err_vfree: vfree(screen_buffer); err_drm_client_framebuffer_delete: fb_helper->fb = NULL; fb_helper->buffer = NULL; drm_client_framebuffer_delete(buffer); return ret; } static void drm_fbdev_generic_damage_blit_real(struct drm_fb_helper *fb_helper, struct drm_clip_rect *clip, struct iosys_map *dst) { struct drm_framebuffer *fb = fb_helper->fb; size_t offset = clip->y1 * fb->pitches[0]; size_t len = clip->x2 - clip->x1; unsigned int y; void *src; switch (drm_format_info_bpp(fb->format, 0)) { case 1: offset += clip->x1 / 8; len = DIV_ROUND_UP(len + clip->x1 % 8, 8); break; case 2: offset += clip->x1 / 4; len = DIV_ROUND_UP(len + clip->x1 % 4, 4); break; case 4: offset += clip->x1 / 2; len = DIV_ROUND_UP(len + clip->x1 % 2, 2); break; default: offset += clip->x1 * fb->format->cpp[0]; len *= fb->format->cpp[0]; break; } src = fb_helper->info->screen_buffer + offset; iosys_map_incr(dst, offset); /* go to first pixel within clip rect */ for (y = clip->y1; y < clip->y2; y++) { iosys_map_memcpy_to(dst, 0, src, len); iosys_map_incr(dst, fb->pitches[0]); src += fb->pitches[0]; } } static int drm_fbdev_generic_damage_blit(struct drm_fb_helper *fb_helper, struct drm_clip_rect *clip) { struct drm_client_buffer *buffer = fb_helper->buffer; struct iosys_map map, dst; int ret; /* * We have to pin the client buffer to its current location while * flushing the shadow buffer. In the general case, concurrent * modesetting operations could try to move the buffer and would * fail. The modeset has to be serialized by acquiring the reservation * object of the underlying BO here. * * For fbdev emulation, we only have to protect against fbdev modeset * operations. Nothing else will involve the client buffer's BO. So it * is sufficient to acquire struct drm_fb_helper.lock here. */ mutex_lock(&fb_helper->lock); ret = drm_client_buffer_vmap(buffer, &map); if (ret) goto out; dst = map; drm_fbdev_generic_damage_blit_real(fb_helper, clip, &dst); drm_client_buffer_vunmap(buffer); out: mutex_unlock(&fb_helper->lock); return ret; } static int drm_fbdev_generic_helper_fb_dirty(struct drm_fb_helper *helper, struct drm_clip_rect *clip) { struct drm_device *dev = helper->dev; int ret; /* Call damage handlers only if necessary */ if (!(clip->x1 < clip->x2 && clip->y1 < clip->y2)) return 0; ret = drm_fbdev_generic_damage_blit(helper, clip); if (drm_WARN_ONCE(dev, ret, "Damage blitter failed: ret=%d\n", ret)) return ret; if (helper->fb->funcs->dirty) { ret = helper->fb->funcs->dirty(helper->fb, NULL, 0, 0, clip, 1); if (drm_WARN_ONCE(dev, ret, "Dirty helper failed: ret=%d\n", ret)) return ret; } return 0; } static const struct drm_fb_helper_funcs drm_fbdev_generic_helper_funcs = { .fb_probe = drm_fbdev_generic_helper_fb_probe, .fb_dirty = drm_fbdev_generic_helper_fb_dirty, }; static void drm_fbdev_generic_client_unregister(struct drm_client_dev *client) { struct drm_fb_helper *fb_helper = drm_fb_helper_from_client(client); if (fb_helper->info) { drm_fb_helper_unregister_info(fb_helper); } else { drm_client_release(&fb_helper->client); drm_fb_helper_unprepare(fb_helper); kfree(fb_helper); } } static int drm_fbdev_generic_client_restore(struct drm_client_dev *client) { drm_fb_helper_lastclose(client->dev); return 0; } static int drm_fbdev_generic_client_hotplug(struct drm_client_dev *client) { struct drm_fb_helper *fb_helper = drm_fb_helper_from_client(client); struct drm_device *dev = client->dev; int ret; if (dev->fb_helper) return drm_fb_helper_hotplug_event(dev->fb_helper); ret = drm_fb_helper_init(dev, fb_helper); if (ret) goto err_drm_err; if (!drm_drv_uses_atomic_modeset(dev)) drm_helper_disable_unused_functions(dev); ret = drm_fb_helper_initial_config(fb_helper); if (ret) goto err_drm_fb_helper_fini; return 0; err_drm_fb_helper_fini: drm_fb_helper_fini(fb_helper); err_drm_err: drm_err(dev, "fbdev: Failed to setup generic emulation (ret=%d)\n", ret); return ret; } static const struct drm_client_funcs drm_fbdev_generic_client_funcs = { .owner = THIS_MODULE, .unregister = drm_fbdev_generic_client_unregister, .restore = drm_fbdev_generic_client_restore, .hotplug = drm_fbdev_generic_client_hotplug, }; /** * drm_fbdev_generic_setup() - Setup generic fbdev emulation * @dev: DRM device * @preferred_bpp: Preferred bits per pixel for the device. * * This function sets up generic fbdev emulation for drivers that supports * dumb buffers with a virtual address and that can be mmap'ed. * drm_fbdev_generic_setup() shall be called after the DRM driver registered * the new DRM device with drm_dev_register(). * * Restore, hotplug events and teardown are all taken care of. Drivers that do * suspend/resume need to call drm_fb_helper_set_suspend_unlocked() themselves. * Simple drivers might use drm_mode_config_helper_suspend(). * * In order to provide fixed mmap-able memory ranges, generic fbdev emulation * uses a shadow buffer in system memory. The implementation blits the shadow * fbdev buffer onto the real buffer in regular intervals. * * This function is safe to call even when there are no connectors present. * Setup will be retried on the next hotplug event. * * The fbdev is destroyed by drm_dev_unregister(). */ void drm_fbdev_generic_setup(struct drm_device *dev, unsigned int preferred_bpp) { struct drm_fb_helper *fb_helper; int ret; drm_WARN(dev, !dev->registered, "Device has not been registered.\n"); drm_WARN(dev, dev->fb_helper, "fb_helper is already set!\n"); fb_helper = kzalloc(sizeof(*fb_helper), GFP_KERNEL); if (!fb_helper) return; drm_fb_helper_prepare(dev, fb_helper, preferred_bpp, &drm_fbdev_generic_helper_funcs); ret = drm_client_init(dev, &fb_helper->client, "fbdev", &drm_fbdev_generic_client_funcs); if (ret) { drm_err(dev, "Failed to register client: %d\n", ret); goto err_drm_client_init; } drm_client_register(&fb_helper->client); return; err_drm_client_init: drm_fb_helper_unprepare(fb_helper); kfree(fb_helper); return; } EXPORT_SYMBOL(drm_fbdev_generic_setup);
2132 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 /* SPDX-License-Identifier: GPL-2.0 */ #include <linux/pm_qos.h> static inline void device_pm_init_common(struct device *dev) { if (!dev->power.early_init) { spin_lock_init(&dev->power.lock); dev->power.qos = NULL; dev->power.early_init = true; } } #ifdef CONFIG_PM static inline void pm_runtime_early_init(struct device *dev) { dev->power.disable_depth = 1; device_pm_init_common(dev); } extern void pm_runtime_init(struct device *dev); extern void pm_runtime_reinit(struct device *dev); extern void pm_runtime_remove(struct device *dev); extern u64 pm_runtime_active_time(struct device *dev); #define WAKE_IRQ_DEDICATED_ALLOCATED BIT(0) #define WAKE_IRQ_DEDICATED_MANAGED BIT(1) #define WAKE_IRQ_DEDICATED_REVERSE BIT(2) #define WAKE_IRQ_DEDICATED_MASK (WAKE_IRQ_DEDICATED_ALLOCATED | \ WAKE_IRQ_DEDICATED_MANAGED | \ WAKE_IRQ_DEDICATED_REVERSE) #define WAKE_IRQ_DEDICATED_ENABLED BIT(3) struct wake_irq { struct device *dev; unsigned int status; int irq; const char *name; }; extern void dev_pm_arm_wake_irq(struct wake_irq *wirq); extern void dev_pm_disarm_wake_irq(struct wake_irq *wirq); extern void dev_pm_enable_wake_irq_check(struct device *dev, bool can_change_status); extern void dev_pm_disable_wake_irq_check(struct device *dev, bool cond_disable); extern void dev_pm_enable_wake_irq_complete(struct device *dev); #ifdef CONFIG_PM_SLEEP extern void device_wakeup_attach_irq(struct device *dev, struct wake_irq *wakeirq); extern void device_wakeup_detach_irq(struct device *dev); extern void device_wakeup_arm_wake_irqs(void); extern void device_wakeup_disarm_wake_irqs(void); #else static inline void device_wakeup_attach_irq(struct device *dev, struct wake_irq *wakeirq) {} static inline void device_wakeup_detach_irq(struct device *dev) { } #endif /* CONFIG_PM_SLEEP */ /* * sysfs.c */ extern int dpm_sysfs_add(struct device *dev); extern void dpm_sysfs_remove(struct device *dev); extern void rpm_sysfs_remove(struct device *dev); extern int wakeup_sysfs_add(struct device *dev); extern void wakeup_sysfs_remove(struct device *dev); extern int pm_qos_sysfs_add_resume_latency(struct device *dev); extern void pm_qos_sysfs_remove_resume_latency(struct device *dev); extern int pm_qos_sysfs_add_flags(struct device *dev); extern void pm_qos_sysfs_remove_flags(struct device *dev); extern int pm_qos_sysfs_add_latency_tolerance(struct device *dev); extern void pm_qos_sysfs_remove_latency_tolerance(struct device *dev); extern int dpm_sysfs_change_owner(struct device *dev, kuid_t kuid, kgid_t kgid); #else /* CONFIG_PM */ static inline void pm_runtime_early_init(struct device *dev) { device_pm_init_common(dev); } static inline void pm_runtime_init(struct device *dev) {} static inline void pm_runtime_reinit(struct device *dev) {} static inline void pm_runtime_remove(struct device *dev) {} static inline int dpm_sysfs_add(struct device *dev) { return 0; } static inline void dpm_sysfs_remove(struct device *dev) {} static inline int dpm_sysfs_change_owner(struct device *dev, kuid_t kuid, kgid_t kgid) { return 0; } #endif #ifdef CONFIG_PM_SLEEP /* kernel/power/main.c */ extern int pm_async_enabled; /* drivers/base/power/main.c */ extern struct list_head dpm_list; /* The active device list */ static inline struct device *to_device(struct list_head *entry) { return container_of(entry, struct device, power.entry); } extern void device_pm_sleep_init(struct device *dev); extern void device_pm_add(struct device *); extern void device_pm_remove(struct device *); extern void device_pm_move_before(struct device *, struct device *); extern void device_pm_move_after(struct device *, struct device *); extern void device_pm_move_last(struct device *); extern void device_pm_check_callbacks(struct device *dev); static inline bool device_pm_initialized(struct device *dev) { return dev->power.in_dpm_list; } /* drivers/base/power/wakeup_stats.c */ extern int wakeup_source_sysfs_add(struct device *parent, struct wakeup_source *ws); extern void wakeup_source_sysfs_remove(struct wakeup_source *ws); extern int pm_wakeup_source_sysfs_add(struct device *parent); #else /* !CONFIG_PM_SLEEP */ static inline void device_pm_sleep_init(struct device *dev) {} static inline void device_pm_add(struct device *dev) {} static inline void device_pm_remove(struct device *dev) { pm_runtime_remove(dev); } static inline void device_pm_move_before(struct device *deva, struct device *devb) {} static inline void device_pm_move_after(struct device *deva, struct device *devb) {} static inline void device_pm_move_last(struct device *dev) {} static inline void device_pm_check_callbacks(struct device *dev) {} static inline bool device_pm_initialized(struct device *dev) { return device_is_registered(dev); } static inline int pm_wakeup_source_sysfs_add(struct device *parent) { return 0; } #endif /* !CONFIG_PM_SLEEP */ static inline void device_pm_init(struct device *dev) { device_pm_init_common(dev); device_pm_sleep_init(dev); pm_runtime_init(dev); }
60 2 59 3 64 1 64 64 2 2 6 6 35 35 9 1 7 2 35 36 14 14 35 14 59 59 59 2 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2012-2013 Samsung Electronics Co., Ltd. */ #include <linux/blkdev.h> #include <linux/slab.h> #include <linux/bitmap.h> #include <linux/buffer_head.h> #include "exfat_raw.h" #include "exfat_fs.h" #if BITS_PER_LONG == 32 #define __le_long __le32 #define lel_to_cpu(A) le32_to_cpu(A) #define cpu_to_lel(A) cpu_to_le32(A) #elif BITS_PER_LONG == 64 #define __le_long __le64 #define lel_to_cpu(A) le64_to_cpu(A) #define cpu_to_lel(A) cpu_to_le64(A) #else #error "BITS_PER_LONG not 32 or 64" #endif /* * Allocation Bitmap Management Functions */ static int exfat_allocate_bitmap(struct super_block *sb, struct exfat_dentry *ep) { struct exfat_sb_info *sbi = EXFAT_SB(sb); long long map_size; unsigned int i, need_map_size; sector_t sector; sbi->map_clu = le32_to_cpu(ep->dentry.bitmap.start_clu); map_size = le64_to_cpu(ep->dentry.bitmap.size); need_map_size = ((EXFAT_DATA_CLUSTER_COUNT(sbi) - 1) / BITS_PER_BYTE) + 1; if (need_map_size != map_size) { exfat_err(sb, "bogus allocation bitmap size(need : %u, cur : %lld)", need_map_size, map_size); /* * Only allowed when bogus allocation * bitmap size is large */ if (need_map_size > map_size) return -EIO; } sbi->map_sectors = ((need_map_size - 1) >> (sb->s_blocksize_bits)) + 1; sbi->vol_amap = kvmalloc_array(sbi->map_sectors, sizeof(struct buffer_head *), GFP_KERNEL); if (!sbi->vol_amap) return -ENOMEM; sector = exfat_cluster_to_sector(sbi, sbi->map_clu); for (i = 0; i < sbi->map_sectors; i++) { sbi->vol_amap[i] = sb_bread(sb, sector + i); if (!sbi->vol_amap[i]) { /* release all buffers and free vol_amap */ int j = 0; while (j < i) brelse(sbi->vol_amap[j++]); kvfree(sbi->vol_amap); sbi->vol_amap = NULL; return -EIO; } } return 0; } int exfat_load_bitmap(struct super_block *sb) { unsigned int i, type; struct exfat_chain clu; struct exfat_sb_info *sbi = EXFAT_SB(sb); exfat_chain_set(&clu, sbi->root_dir, 0, ALLOC_FAT_CHAIN); while (clu.dir != EXFAT_EOF_CLUSTER) { for (i = 0; i < sbi->dentries_per_clu; i++) { struct exfat_dentry *ep; struct buffer_head *bh; ep = exfat_get_dentry(sb, &clu, i, &bh); if (!ep) return -EIO; type = exfat_get_entry_type(ep); if (type == TYPE_UNUSED) break; if (type != TYPE_BITMAP) continue; if (ep->dentry.bitmap.flags == 0x0) { int err; err = exfat_allocate_bitmap(sb, ep); brelse(bh); return err; } brelse(bh); } if (exfat_get_next_cluster(sb, &clu.dir)) return -EIO; } return -EINVAL; } void exfat_free_bitmap(struct exfat_sb_info *sbi) { int i; for (i = 0; i < sbi->map_sectors; i++) __brelse(sbi->vol_amap[i]); kvfree(sbi->vol_amap); } int exfat_set_bitmap(struct inode *inode, unsigned int clu, bool sync) { int i, b; unsigned int ent_idx; struct super_block *sb = inode->i_sb; struct exfat_sb_info *sbi = EXFAT_SB(sb); if (!is_valid_cluster(sbi, clu)) return -EINVAL; ent_idx = CLUSTER_TO_BITMAP_ENT(clu); i = BITMAP_OFFSET_SECTOR_INDEX(sb, ent_idx); b = BITMAP_OFFSET_BIT_IN_SECTOR(sb, ent_idx); set_bit_le(b, sbi->vol_amap[i]->b_data); exfat_update_bh(sbi->vol_amap[i], sync); return 0; } void exfat_clear_bitmap(struct inode *inode, unsigned int clu, bool sync) { int i, b; unsigned int ent_idx; struct super_block *sb = inode->i_sb; struct exfat_sb_info *sbi = EXFAT_SB(sb); struct exfat_mount_options *opts = &sbi->options; if (!is_valid_cluster(sbi, clu)) return; ent_idx = CLUSTER_TO_BITMAP_ENT(clu); i = BITMAP_OFFSET_SECTOR_INDEX(sb, ent_idx); b = BITMAP_OFFSET_BIT_IN_SECTOR(sb, ent_idx); clear_bit_le(b, sbi->vol_amap[i]->b_data); exfat_update_bh(sbi->vol_amap[i], sync); if (opts->discard) { int ret_discard; ret_discard = sb_issue_discard(sb, exfat_cluster_to_sector(sbi, clu), (1 << sbi->sect_per_clus_bits), GFP_NOFS, 0); if (ret_discard == -EOPNOTSUPP) { exfat_err(sb, "discard not supported by device, disabling"); opts->discard = 0; } } } /* * If the value of "clu" is 0, it means cluster 2 which is the first cluster of * the cluster heap. */ unsigned int exfat_find_free_bitmap(struct super_block *sb, unsigned int clu) { unsigned int i, map_i, map_b, ent_idx; unsigned int clu_base, clu_free; unsigned long clu_bits, clu_mask; struct exfat_sb_info *sbi = EXFAT_SB(sb); __le_long bitval; WARN_ON(clu < EXFAT_FIRST_CLUSTER); ent_idx = ALIGN_DOWN(CLUSTER_TO_BITMAP_ENT(clu), BITS_PER_LONG); clu_base = BITMAP_ENT_TO_CLUSTER(ent_idx); clu_mask = IGNORED_BITS_REMAINED(clu, clu_base); map_i = BITMAP_OFFSET_SECTOR_INDEX(sb, ent_idx); map_b = BITMAP_OFFSET_BYTE_IN_SECTOR(sb, ent_idx); for (i = EXFAT_FIRST_CLUSTER; i < sbi->num_clusters; i += BITS_PER_LONG) { bitval = *(__le_long *)(sbi->vol_amap[map_i]->b_data + map_b); if (clu_mask > 0) { bitval |= cpu_to_lel(clu_mask); clu_mask = 0; } if (lel_to_cpu(bitval) != ULONG_MAX) { clu_bits = lel_to_cpu(bitval); clu_free = clu_base + ffz(clu_bits); if (clu_free < sbi->num_clusters) return clu_free; } clu_base += BITS_PER_LONG; map_b += sizeof(long); if (map_b >= sb->s_blocksize || clu_base >= sbi->num_clusters) { if (++map_i >= sbi->map_sectors) { clu_base = EXFAT_FIRST_CLUSTER; map_i = 0; } map_b = 0; } } return EXFAT_EOF_CLUSTER; } int exfat_count_used_clusters(struct super_block *sb, unsigned int *ret_count) { struct exfat_sb_info *sbi = EXFAT_SB(sb); unsigned int count = 0; unsigned int i, map_i = 0, map_b = 0; unsigned int total_clus = EXFAT_DATA_CLUSTER_COUNT(sbi); unsigned int last_mask = total_clus & (BITS_PER_LONG - 1); unsigned long *bitmap, clu_bits; total_clus &= ~last_mask; for (i = 0; i < total_clus; i += BITS_PER_LONG) { bitmap = (void *)(sbi->vol_amap[map_i]->b_data + map_b); count += hweight_long(*bitmap); map_b += sizeof(long); if (map_b >= (unsigned int)sb->s_blocksize) { map_i++; map_b = 0; } } if (last_mask) { bitmap = (void *)(sbi->vol_amap[map_i]->b_data + map_b); clu_bits = lel_to_cpu(*(__le_long *)bitmap); count += hweight_long(clu_bits & BITMAP_LAST_WORD_MASK(last_mask)); } *ret_count = count; return 0; } int exfat_trim_fs(struct inode *inode, struct fstrim_range *range) { unsigned int trim_begin, trim_end, count, next_free_clu; u64 clu_start, clu_end, trim_minlen, trimmed_total = 0; struct super_block *sb = inode->i_sb; struct exfat_sb_info *sbi = EXFAT_SB(sb); int err = 0; clu_start = max_t(u64, range->start >> sbi->cluster_size_bits, EXFAT_FIRST_CLUSTER); clu_end = clu_start + (range->len >> sbi->cluster_size_bits) - 1; trim_minlen = range->minlen >> sbi->cluster_size_bits; if (clu_start >= sbi->num_clusters || range->len < sbi->cluster_size) return -EINVAL; if (clu_end >= sbi->num_clusters) clu_end = sbi->num_clusters - 1; mutex_lock(&sbi->bitmap_lock); trim_begin = trim_end = exfat_find_free_bitmap(sb, clu_start); if (trim_begin == EXFAT_EOF_CLUSTER) goto unlock; next_free_clu = exfat_find_free_bitmap(sb, trim_end + 1); if (next_free_clu == EXFAT_EOF_CLUSTER) goto unlock; do { if (next_free_clu == trim_end + 1) { /* extend trim range for continuous free cluster */ trim_end++; } else { /* trim current range if it's larger than trim_minlen */ count = trim_end - trim_begin + 1; if (count >= trim_minlen) { err = sb_issue_discard(sb, exfat_cluster_to_sector(sbi, trim_begin), count * sbi->sect_per_clus, GFP_NOFS, 0); if (err) goto unlock; trimmed_total += count; } /* set next start point of the free hole */ trim_begin = trim_end = next_free_clu; } if (next_free_clu >= clu_end) break; if (fatal_signal_pending(current)) { err = -ERESTARTSYS; goto unlock; } next_free_clu = exfat_find_free_bitmap(sb, next_free_clu + 1); } while (next_free_clu != EXFAT_EOF_CLUSTER && next_free_clu > trim_end); /* try to trim remainder */ count = trim_end - trim_begin + 1; if (count >= trim_minlen) { err = sb_issue_discard(sb, exfat_cluster_to_sector(sbi, trim_begin), count * sbi->sect_per_clus, GFP_NOFS, 0); if (err) goto unlock; trimmed_total += count; } unlock: mutex_unlock(&sbi->bitmap_lock); range->len = trimmed_total << sbi->cluster_size_bits; return err; }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_NSPROXY_H #define _LINUX_NSPROXY_H #include <linux/refcount.h> #include <linux/spinlock.h> #include <linux/sched.h> struct mnt_namespace; struct uts_namespace; struct ipc_namespace; struct pid_namespace; struct cgroup_namespace; struct fs_struct; /* * A structure to contain pointers to all per-process * namespaces - fs (mount), uts, network, sysvipc, etc. * * The pid namespace is an exception -- it's accessed using * task_active_pid_ns. The pid namespace here is the * namespace that children will use. * * 'count' is the number of tasks holding a reference. * The count for each namespace, then, will be the number * of nsproxies pointing to it, not the number of tasks. * * The nsproxy is shared by tasks which share all namespaces. * As soon as a single namespace is cloned or unshared, the * nsproxy is copied. */ struct nsproxy { refcount_t count; struct uts_namespace *uts_ns; struct ipc_namespace *ipc_ns; struct mnt_namespace *mnt_ns; struct pid_namespace *pid_ns_for_children; struct net *net_ns; struct time_namespace *time_ns; struct time_namespace *time_ns_for_children; struct cgroup_namespace *cgroup_ns; }; extern struct nsproxy init_nsproxy; /* * A structure to encompass all bits needed to install * a partial or complete new set of namespaces. * * If a new user namespace is requested cred will * point to a modifiable set of credentials. If a pointer * to a modifiable set is needed nsset_cred() must be * used and tested. */ struct nsset { unsigned flags; struct nsproxy *nsproxy; struct fs_struct *fs; const struct cred *cred; }; static inline struct cred *nsset_cred(struct nsset *set) { if (set->flags & CLONE_NEWUSER) return (struct cred *)set->cred; return NULL; } /* * the namespaces access rules are: * * 1. only current task is allowed to change tsk->nsproxy pointer or * any pointer on the nsproxy itself. Current must hold the task_lock * when changing tsk->nsproxy. * * 2. when accessing (i.e. reading) current task's namespaces - no * precautions should be taken - just dereference the pointers * * 3. the access to other task namespaces is performed like this * task_lock(task); * nsproxy = task->nsproxy; * if (nsproxy != NULL) { * / * * * work with the namespaces here * * e.g. get the reference on one of them * * / * } / * * * NULL task->nsproxy means that this task is * * almost dead (zombie) * * / * task_unlock(task); * */ int copy_namespaces(unsigned long flags, struct task_struct *tsk); void exit_task_namespaces(struct task_struct *tsk); void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new); int exec_task_namespaces(void); void free_nsproxy(struct nsproxy *ns); int unshare_nsproxy_namespaces(unsigned long, struct nsproxy **, struct cred *, struct fs_struct *); int __init nsproxy_cache_init(void); static inline void put_nsproxy(struct nsproxy *ns) { if (refcount_dec_and_test(&ns->count)) free_nsproxy(ns); } static inline void get_nsproxy(struct nsproxy *ns) { refcount_inc(&ns->count); } #endif
5 1 18 2 19 6 7 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 // SPDX-License-Identifier: GPL-2.0 #include <linux/buffer_head.h> #include <linux/slab.h> #include "minix.h" enum {DEPTH = 3, DIRECT = 7}; /* Only double indirect */ typedef u16 block_t; /* 16 bit, host order */ static inline unsigned long block_to_cpu(block_t n) { return n; } static inline block_t cpu_to_block(unsigned long n) { return n; } static inline block_t *i_data(struct inode *inode) { return (block_t *)minix_i(inode)->u.i1_data; } static int block_to_path(struct inode * inode, long block, int offsets[DEPTH]) { int n = 0; if (block < 0) { printk("MINIX-fs: block_to_path: block %ld < 0 on dev %pg\n", block, inode->i_sb->s_bdev); return 0; } if ((u64)block * BLOCK_SIZE >= inode->i_sb->s_maxbytes) return 0; if (block < 7) { offsets[n++] = block; } else if ((block -= 7) < 512) { offsets[n++] = 7; offsets[n++] = block; } else { block -= 512; offsets[n++] = 8; offsets[n++] = block>>9; offsets[n++] = block & 511; } return n; } #include "itree_common.c" int V1_minix_get_block(struct inode * inode, long block, struct buffer_head *bh_result, int create) { return get_block(inode, block, bh_result, create); } void V1_minix_truncate(struct inode * inode) { truncate(inode); } unsigned V1_minix_blocks(loff_t size, struct super_block *sb) { return nblocks(size, sb); }
873 861 5 865 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 // SPDX-License-Identifier: GPL-2.0-or-later /* * Crypto API wrapper for the generic SHA256 code from lib/crypto/sha256.c * * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com> * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk> * Copyright (c) 2002 James Morris <jmorris@intercode.com.au> * SHA224 Support Copyright 2007 Intel Corporation <jonathan.lynch@intel.com> */ #include <crypto/internal/hash.h> #include <linux/init.h> #include <linux/module.h> #include <linux/mm.h> #include <linux/types.h> #include <crypto/sha2.h> #include <crypto/sha256_base.h> #include <asm/byteorder.h> #include <asm/unaligned.h> const u8 sha224_zero_message_hash[SHA224_DIGEST_SIZE] = { 0xd1, 0x4a, 0x02, 0x8c, 0x2a, 0x3a, 0x2b, 0xc9, 0x47, 0x61, 0x02, 0xbb, 0x28, 0x82, 0x34, 0xc4, 0x15, 0xa2, 0xb0, 0x1f, 0x82, 0x8e, 0xa6, 0x2a, 0xc5, 0xb3, 0xe4, 0x2f }; EXPORT_SYMBOL_GPL(sha224_zero_message_hash); const u8 sha256_zero_message_hash[SHA256_DIGEST_SIZE] = { 0xe3, 0xb0, 0xc4, 0x42, 0x98, 0xfc, 0x1c, 0x14, 0x9a, 0xfb, 0xf4, 0xc8, 0x99, 0x6f, 0xb9, 0x24, 0x27, 0xae, 0x41, 0xe4, 0x64, 0x9b, 0x93, 0x4c, 0xa4, 0x95, 0x99, 0x1b, 0x78, 0x52, 0xb8, 0x55 }; EXPORT_SYMBOL_GPL(sha256_zero_message_hash); int crypto_sha256_update(struct shash_desc *desc, const u8 *data, unsigned int len) { sha256_update(shash_desc_ctx(desc), data, len); return 0; } EXPORT_SYMBOL(crypto_sha256_update); static int crypto_sha256_final(struct shash_desc *desc, u8 *out) { if (crypto_shash_digestsize(desc->tfm) == SHA224_DIGEST_SIZE) sha224_final(shash_desc_ctx(desc), out); else sha256_final(shash_desc_ctx(desc), out); return 0; } int crypto_sha256_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *hash) { sha256_update(shash_desc_ctx(desc), data, len); return crypto_sha256_final(desc, hash); } EXPORT_SYMBOL(crypto_sha256_finup); static struct shash_alg sha256_algs[2] = { { .digestsize = SHA256_DIGEST_SIZE, .init = sha256_base_init, .update = crypto_sha256_update, .final = crypto_sha256_final, .finup = crypto_sha256_finup, .descsize = sizeof(struct sha256_state), .base = { .cra_name = "sha256", .cra_driver_name= "sha256-generic", .cra_priority = 100, .cra_blocksize = SHA256_BLOCK_SIZE, .cra_module = THIS_MODULE, } }, { .digestsize = SHA224_DIGEST_SIZE, .init = sha224_base_init, .update = crypto_sha256_update, .final = crypto_sha256_final, .finup = crypto_sha256_finup, .descsize = sizeof(struct sha256_state), .base = { .cra_name = "sha224", .cra_driver_name= "sha224-generic", .cra_priority = 100, .cra_blocksize = SHA224_BLOCK_SIZE, .cra_module = THIS_MODULE, } } }; static int __init sha256_generic_mod_init(void) { return crypto_register_shashes(sha256_algs, ARRAY_SIZE(sha256_algs)); } static void __exit sha256_generic_mod_fini(void) { crypto_unregister_shashes(sha256_algs, ARRAY_SIZE(sha256_algs)); } subsys_initcall(sha256_generic_mod_init); module_exit(sha256_generic_mod_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("SHA-224 and SHA-256 Secure Hash Algorithm"); MODULE_ALIAS_CRYPTO("sha224"); MODULE_ALIAS_CRYPTO("sha224-generic"); MODULE_ALIAS_CRYPTO("sha256"); MODULE_ALIAS_CRYPTO("sha256-generic");
71 70 5 45 199 199 39 407 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 // SPDX-License-Identifier: GPL-2.0-only #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/netfilter.h> #include <linux/rhashtable.h> #include <linux/netdevice.h> #include <net/ip.h> #include <net/ip6_route.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_flow_table.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_tuple.h> static DEFINE_MUTEX(flowtable_lock); static LIST_HEAD(flowtables); static void flow_offload_fill_dir(struct flow_offload *flow, enum flow_offload_tuple_dir dir) { struct flow_offload_tuple *ft = &flow->tuplehash[dir].tuple; struct nf_conntrack_tuple *ctt = &flow->ct->tuplehash[dir].tuple; ft->dir = dir; switch (ctt->src.l3num) { case NFPROTO_IPV4: ft->src_v4 = ctt->src.u3.in; ft->dst_v4 = ctt->dst.u3.in; break; case NFPROTO_IPV6: ft->src_v6 = ctt->src.u3.in6; ft->dst_v6 = ctt->dst.u3.in6; break; } ft->l3proto = ctt->src.l3num; ft->l4proto = ctt->dst.protonum; switch (ctt->dst.protonum) { case IPPROTO_TCP: case IPPROTO_UDP: ft->src_port = ctt->src.u.tcp.port; ft->dst_port = ctt->dst.u.tcp.port; break; } } struct flow_offload *flow_offload_alloc(struct nf_conn *ct) { struct flow_offload *flow; if (unlikely(nf_ct_is_dying(ct))) return NULL; flow = kzalloc(sizeof(*flow), GFP_ATOMIC); if (!flow) return NULL; refcount_inc(&ct->ct_general.use); flow->ct = ct; flow_offload_fill_dir(flow, FLOW_OFFLOAD_DIR_ORIGINAL); flow_offload_fill_dir(flow, FLOW_OFFLOAD_DIR_REPLY); if (ct->status & IPS_SRC_NAT) __set_bit(NF_FLOW_SNAT, &flow->flags); if (ct->status & IPS_DST_NAT) __set_bit(NF_FLOW_DNAT, &flow->flags); return flow; } EXPORT_SYMBOL_GPL(flow_offload_alloc); static u32 flow_offload_dst_cookie(struct flow_offload_tuple *flow_tuple) { const struct rt6_info *rt; if (flow_tuple->l3proto == NFPROTO_IPV6) { rt = (const struct rt6_info *)flow_tuple->dst_cache; return rt6_get_cookie(rt); } return 0; } static int flow_offload_fill_route(struct flow_offload *flow, const struct nf_flow_route *route, enum flow_offload_tuple_dir dir) { struct flow_offload_tuple *flow_tuple = &flow->tuplehash[dir].tuple; struct dst_entry *dst = route->tuple[dir].dst; int i, j = 0; switch (flow_tuple->l3proto) { case NFPROTO_IPV4: flow_tuple->mtu = ip_dst_mtu_maybe_forward(dst, true); break; case NFPROTO_IPV6: flow_tuple->mtu = ip6_dst_mtu_maybe_forward(dst, true); break; } flow_tuple->iifidx = route->tuple[dir].in.ifindex; for (i = route->tuple[dir].in.num_encaps - 1; i >= 0; i--) { flow_tuple->encap[j].id = route->tuple[dir].in.encap[i].id; flow_tuple->encap[j].proto = route->tuple[dir].in.encap[i].proto; if (route->tuple[dir].in.ingress_vlans & BIT(i)) flow_tuple->in_vlan_ingress |= BIT(j); j++; } flow_tuple->encap_num = route->tuple[dir].in.num_encaps; switch (route->tuple[dir].xmit_type) { case FLOW_OFFLOAD_XMIT_DIRECT: memcpy(flow_tuple->out.h_dest, route->tuple[dir].out.h_dest, ETH_ALEN); memcpy(flow_tuple->out.h_source, route->tuple[dir].out.h_source, ETH_ALEN); flow_tuple->out.ifidx = route->tuple[dir].out.ifindex; flow_tuple->out.hw_ifidx = route->tuple[dir].out.hw_ifindex; break; case FLOW_OFFLOAD_XMIT_XFRM: case FLOW_OFFLOAD_XMIT_NEIGH: flow_tuple->dst_cache = dst; flow_tuple->dst_cookie = flow_offload_dst_cookie(flow_tuple); break; default: WARN_ON_ONCE(1); break; } flow_tuple->xmit_type = route->tuple[dir].xmit_type; return 0; } static void nft_flow_dst_release(struct flow_offload *flow, enum flow_offload_tuple_dir dir) { if (flow->tuplehash[dir].tuple.xmit_type == FLOW_OFFLOAD_XMIT_NEIGH || flow->tuplehash[dir].tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM) dst_release(flow->tuplehash[dir].tuple.dst_cache); } void flow_offload_route_init(struct flow_offload *flow, const struct nf_flow_route *route) { flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_ORIGINAL); flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_REPLY); flow->type = NF_FLOW_OFFLOAD_ROUTE; } EXPORT_SYMBOL_GPL(flow_offload_route_init); static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp) { tcp->seen[0].td_maxwin = 0; tcp->seen[1].td_maxwin = 0; } static void flow_offload_fixup_ct(struct nf_conn *ct) { struct net *net = nf_ct_net(ct); int l4num = nf_ct_protonum(ct); s32 timeout; if (l4num == IPPROTO_TCP) { struct nf_tcp_net *tn = nf_tcp_pernet(net); flow_offload_fixup_tcp(&ct->proto.tcp); timeout = tn->timeouts[ct->proto.tcp.state]; timeout -= tn->offload_timeout; } else if (l4num == IPPROTO_UDP) { struct nf_udp_net *tn = nf_udp_pernet(net); enum udp_conntrack state = test_bit(IPS_SEEN_REPLY_BIT, &ct->status) ? UDP_CT_REPLIED : UDP_CT_UNREPLIED; timeout = tn->timeouts[state]; timeout -= tn->offload_timeout; } else { return; } if (timeout < 0) timeout = 0; if (nf_flow_timeout_delta(READ_ONCE(ct->timeout)) > (__s32)timeout) WRITE_ONCE(ct->timeout, nfct_time_stamp + timeout); } static void flow_offload_route_release(struct flow_offload *flow) { nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_ORIGINAL); nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_REPLY); } void flow_offload_free(struct flow_offload *flow) { switch (flow->type) { case NF_FLOW_OFFLOAD_ROUTE: flow_offload_route_release(flow); break; default: break; } nf_ct_put(flow->ct); kfree_rcu(flow, rcu_head); } EXPORT_SYMBOL_GPL(flow_offload_free); static u32 flow_offload_hash(const void *data, u32 len, u32 seed) { const struct flow_offload_tuple *tuple = data; return jhash(tuple, offsetof(struct flow_offload_tuple, __hash), seed); } static u32 flow_offload_hash_obj(const void *data, u32 len, u32 seed) { const struct flow_offload_tuple_rhash *tuplehash = data; return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, __hash), seed); } static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg, const void *ptr) { const struct flow_offload_tuple *tuple = arg->key; const struct flow_offload_tuple_rhash *x = ptr; if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, __hash))) return 1; return 0; } static const struct rhashtable_params nf_flow_offload_rhash_params = { .head_offset = offsetof(struct flow_offload_tuple_rhash, node), .hashfn = flow_offload_hash, .obj_hashfn = flow_offload_hash_obj, .obj_cmpfn = flow_offload_hash_cmp, .automatic_shrinking = true, }; unsigned long flow_offload_get_timeout(struct flow_offload *flow) { unsigned long timeout = NF_FLOW_TIMEOUT; struct net *net = nf_ct_net(flow->ct); int l4num = nf_ct_protonum(flow->ct); if (l4num == IPPROTO_TCP) { struct nf_tcp_net *tn = nf_tcp_pernet(net); timeout = tn->offload_timeout; } else if (l4num == IPPROTO_UDP) { struct nf_udp_net *tn = nf_udp_pernet(net); timeout = tn->offload_timeout; } return timeout; } int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow) { int err; flow->timeout = nf_flowtable_time_stamp + flow_offload_get_timeout(flow); err = rhashtable_insert_fast(&flow_table->rhashtable, &flow->tuplehash[0].node, nf_flow_offload_rhash_params); if (err < 0) return err; err = rhashtable_insert_fast(&flow_table->rhashtable, &flow->tuplehash[1].node, nf_flow_offload_rhash_params); if (err < 0) { rhashtable_remove_fast(&flow_table->rhashtable, &flow->tuplehash[0].node, nf_flow_offload_rhash_params); return err; } nf_ct_offload_timeout(flow->ct); if (nf_flowtable_hw_offload(flow_table)) { __set_bit(NF_FLOW_HW, &flow->flags); nf_flow_offload_add(flow_table, flow); } return 0; } EXPORT_SYMBOL_GPL(flow_offload_add); void flow_offload_refresh(struct nf_flowtable *flow_table, struct flow_offload *flow, bool force) { u32 timeout; timeout = nf_flowtable_time_stamp + flow_offload_get_timeout(flow); if (force || timeout - READ_ONCE(flow->timeout) > HZ) WRITE_ONCE(flow->timeout, timeout); else return; if (likely(!nf_flowtable_hw_offload(flow_table))) return; nf_flow_offload_add(flow_table, flow); } EXPORT_SYMBOL_GPL(flow_offload_refresh); static inline bool nf_flow_has_expired(const struct flow_offload *flow) { return nf_flow_timeout_delta(flow->timeout) <= 0; } static void flow_offload_del(struct nf_flowtable *flow_table, struct flow_offload *flow) { rhashtable_remove_fast(&flow_table->rhashtable, &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node, nf_flow_offload_rhash_params); rhashtable_remove_fast(&flow_table->rhashtable, &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node, nf_flow_offload_rhash_params); flow_offload_free(flow); } void flow_offload_teardown(struct flow_offload *flow) { clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status); set_bit(NF_FLOW_TEARDOWN, &flow->flags); flow_offload_fixup_ct(flow->ct); } EXPORT_SYMBOL_GPL(flow_offload_teardown); struct flow_offload_tuple_rhash * flow_offload_lookup(struct nf_flowtable *flow_table, struct flow_offload_tuple *tuple) { struct flow_offload_tuple_rhash *tuplehash; struct flow_offload *flow; int dir; tuplehash = rhashtable_lookup(&flow_table->rhashtable, tuple, nf_flow_offload_rhash_params); if (!tuplehash) return NULL; dir = tuplehash->tuple.dir; flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); if (test_bit(NF_FLOW_TEARDOWN, &flow->flags)) return NULL; if (unlikely(nf_ct_is_dying(flow->ct))) return NULL; return tuplehash; } EXPORT_SYMBOL_GPL(flow_offload_lookup); static int nf_flow_table_iterate(struct nf_flowtable *flow_table, void (*iter)(struct nf_flowtable *flowtable, struct flow_offload *flow, void *data), void *data) { struct flow_offload_tuple_rhash *tuplehash; struct rhashtable_iter hti; struct flow_offload *flow; int err = 0; rhashtable_walk_enter(&flow_table->rhashtable, &hti); rhashtable_walk_start(&hti); while ((tuplehash = rhashtable_walk_next(&hti))) { if (IS_ERR(tuplehash)) { if (PTR_ERR(tuplehash) != -EAGAIN) { err = PTR_ERR(tuplehash); break; } continue; } if (tuplehash->tuple.dir) continue; flow = container_of(tuplehash, struct flow_offload, tuplehash[0]); iter(flow_table, flow, data); } rhashtable_walk_stop(&hti); rhashtable_walk_exit(&hti); return err; } static bool nf_flow_custom_gc(struct nf_flowtable *flow_table, const struct flow_offload *flow) { return flow_table->type->gc && flow_table->type->gc(flow); } static void nf_flow_offload_gc_step(struct nf_flowtable *flow_table, struct flow_offload *flow, void *data) { if (nf_flow_has_expired(flow) || nf_ct_is_dying(flow->ct) || nf_flow_custom_gc(flow_table, flow)) flow_offload_teardown(flow); if (test_bit(NF_FLOW_TEARDOWN, &flow->flags)) { if (test_bit(NF_FLOW_HW, &flow->flags)) { if (!test_bit(NF_FLOW_HW_DYING, &flow->flags)) nf_flow_offload_del(flow_table, flow); else if (test_bit(NF_FLOW_HW_DEAD, &flow->flags)) flow_offload_del(flow_table, flow); } else { flow_offload_del(flow_table, flow); } } else if (test_bit(NF_FLOW_HW, &flow->flags)) { nf_flow_offload_stats(flow_table, flow); } } void nf_flow_table_gc_run(struct nf_flowtable *flow_table) { nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, NULL); } static void nf_flow_offload_work_gc(struct work_struct *work) { struct nf_flowtable *flow_table; flow_table = container_of(work, struct nf_flowtable, gc_work.work); nf_flow_table_gc_run(flow_table); queue_delayed_work(system_power_efficient_wq, &flow_table->gc_work, HZ); } static void nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff, __be16 port, __be16 new_port) { struct tcphdr *tcph; tcph = (void *)(skb_network_header(skb) + thoff); inet_proto_csum_replace2(&tcph->check, skb, port, new_port, false); } static void nf_flow_nat_port_udp(struct sk_buff *skb, unsigned int thoff, __be16 port, __be16 new_port) { struct udphdr *udph; udph = (void *)(skb_network_header(skb) + thoff); if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) { inet_proto_csum_replace2(&udph->check, skb, port, new_port, false); if (!udph->check) udph->check = CSUM_MANGLED_0; } } static void nf_flow_nat_port(struct sk_buff *skb, unsigned int thoff, u8 protocol, __be16 port, __be16 new_port) { switch (protocol) { case IPPROTO_TCP: nf_flow_nat_port_tcp(skb, thoff, port, new_port); break; case IPPROTO_UDP: nf_flow_nat_port_udp(skb, thoff, port, new_port); break; } } void nf_flow_snat_port(const struct flow_offload *flow, struct sk_buff *skb, unsigned int thoff, u8 protocol, enum flow_offload_tuple_dir dir) { struct flow_ports *hdr; __be16 port, new_port; hdr = (void *)(skb_network_header(skb) + thoff); switch (dir) { case FLOW_OFFLOAD_DIR_ORIGINAL: port = hdr->source; new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port; hdr->source = new_port; break; case FLOW_OFFLOAD_DIR_REPLY: port = hdr->dest; new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port; hdr->dest = new_port; break; } nf_flow_nat_port(skb, thoff, protocol, port, new_port); } EXPORT_SYMBOL_GPL(nf_flow_snat_port); void nf_flow_dnat_port(const struct flow_offload *flow, struct sk_buff *skb, unsigned int thoff, u8 protocol, enum flow_offload_tuple_dir dir) { struct flow_ports *hdr; __be16 port, new_port; hdr = (void *)(skb_network_header(skb) + thoff); switch (dir) { case FLOW_OFFLOAD_DIR_ORIGINAL: port = hdr->dest; new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_port; hdr->dest = new_port; break; case FLOW_OFFLOAD_DIR_REPLY: port = hdr->source; new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port; hdr->source = new_port; break; } nf_flow_nat_port(skb, thoff, protocol, port, new_port); } EXPORT_SYMBOL_GPL(nf_flow_dnat_port); int nf_flow_table_init(struct nf_flowtable *flowtable) { int err; INIT_DELAYED_WORK(&flowtable->gc_work, nf_flow_offload_work_gc); flow_block_init(&flowtable->flow_block); init_rwsem(&flowtable->flow_block_lock); err = rhashtable_init(&flowtable->rhashtable, &nf_flow_offload_rhash_params); if (err < 0) return err; queue_delayed_work(system_power_efficient_wq, &flowtable->gc_work, HZ); mutex_lock(&flowtable_lock); list_add(&flowtable->list, &flowtables); mutex_unlock(&flowtable_lock); return 0; } EXPORT_SYMBOL_GPL(nf_flow_table_init); static void nf_flow_table_do_cleanup(struct nf_flowtable *flow_table, struct flow_offload *flow, void *data) { struct net_device *dev = data; if (!dev) { flow_offload_teardown(flow); return; } if (net_eq(nf_ct_net(flow->ct), dev_net(dev)) && (flow->tuplehash[0].tuple.iifidx == dev->ifindex || flow->tuplehash[1].tuple.iifidx == dev->ifindex)) flow_offload_teardown(flow); } void nf_flow_table_gc_cleanup(struct nf_flowtable *flowtable, struct net_device *dev) { nf_flow_table_iterate(flowtable, nf_flow_table_do_cleanup, dev); flush_delayed_work(&flowtable->gc_work); nf_flow_table_offload_flush(flowtable); } void nf_flow_table_cleanup(struct net_device *dev) { struct nf_flowtable *flowtable; mutex_lock(&flowtable_lock); list_for_each_entry(flowtable, &flowtables, list) nf_flow_table_gc_cleanup(flowtable, dev); mutex_unlock(&flowtable_lock); } EXPORT_SYMBOL_GPL(nf_flow_table_cleanup); void nf_flow_table_free(struct nf_flowtable *flow_table) { mutex_lock(&flowtable_lock); list_del(&flow_table->list); mutex_unlock(&flowtable_lock); cancel_delayed_work_sync(&flow_table->gc_work); nf_flow_table_offload_flush(flow_table); /* ... no more pending work after this stage ... */ nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL); nf_flow_table_gc_run(flow_table); nf_flow_table_offload_flush_cleanup(flow_table); rhashtable_destroy(&flow_table->rhashtable); } EXPORT_SYMBOL_GPL(nf_flow_table_free); static int nf_flow_table_init_net(struct net *net) { net->ft.stat = alloc_percpu(struct nf_flow_table_stat); return net->ft.stat ? 0 : -ENOMEM; } static void nf_flow_table_fini_net(struct net *net) { free_percpu(net->ft.stat); } static int nf_flow_table_pernet_init(struct net *net) { int ret; ret = nf_flow_table_init_net(net); if (ret < 0) return ret; ret = nf_flow_table_init_proc(net); if (ret < 0) goto out_proc; return 0; out_proc: nf_flow_table_fini_net(net); return ret; } static void nf_flow_table_pernet_exit(struct list_head *net_exit_list) { struct net *net; list_for_each_entry(net, net_exit_list, exit_list) { nf_flow_table_fini_proc(net); nf_flow_table_fini_net(net); } } static struct pernet_operations nf_flow_table_net_ops = { .init = nf_flow_table_pernet_init, .exit_batch = nf_flow_table_pernet_exit, }; static int __init nf_flow_table_module_init(void) { int ret; ret = register_pernet_subsys(&nf_flow_table_net_ops); if (ret < 0) return ret; ret = nf_flow_table_offload_init(); if (ret) goto out_offload; return 0; out_offload: unregister_pernet_subsys(&nf_flow_table_net_ops); return ret; } static void __exit nf_flow_table_module_exit(void) { nf_flow_table_offload_exit(); unregister_pernet_subsys(&nf_flow_table_net_ops); } module_init(nf_flow_table_module_init); module_exit(nf_flow_table_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); MODULE_DESCRIPTION("Netfilter flow table module");
4 1 3 28 5 48 28 29 906 36 870 36 3 23 22 22 22 18 18 22 3 22 2 2 21 22 22 18 18 2 22 22 18 47 48 48 46 48 24 1 39 2 1 1 1 1 1 1 47 47 22 1 22 23 23 2 19 19 23 3 5 23 2 23 5 23 25 1 1 23 1 1 22 1 24 1 7 20 21 21 24 23 22 3 24 2 24 23 24 19 24 24 24 24 24 23 1 24 1 1 1 1 1 23 24 21 24 24 1 19 19 25 25 2 1 1 2 24 6 24 23 3 3 37 53 6 49 48 23 48 17 15 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 // SPDX-License-Identifier: GPL-2.0 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/mm.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/sched/coredump.h> #include <linux/mmu_notifier.h> #include <linux/rmap.h> #include <linux/swap.h> #include <linux/mm_inline.h> #include <linux/kthread.h> #include <linux/khugepaged.h> #include <linux/freezer.h> #include <linux/mman.h> #include <linux/hashtable.h> #include <linux/userfaultfd_k.h> #include <linux/page_idle.h> #include <linux/page_table_check.h> #include <linux/rcupdate_wait.h> #include <linux/swapops.h> #include <linux/shmem_fs.h> #include <linux/ksm.h> #include <asm/tlb.h> #include <asm/pgalloc.h> #include "internal.h" #include "mm_slot.h" enum scan_result { SCAN_FAIL, SCAN_SUCCEED, SCAN_PMD_NULL, SCAN_PMD_NONE, SCAN_PMD_MAPPED, SCAN_EXCEED_NONE_PTE, SCAN_EXCEED_SWAP_PTE, SCAN_EXCEED_SHARED_PTE, SCAN_PTE_NON_PRESENT, SCAN_PTE_UFFD_WP, SCAN_PTE_MAPPED_HUGEPAGE, SCAN_PAGE_RO, SCAN_LACK_REFERENCED_PAGE, SCAN_PAGE_NULL, SCAN_SCAN_ABORT, SCAN_PAGE_COUNT, SCAN_PAGE_LRU, SCAN_PAGE_LOCK, SCAN_PAGE_ANON, SCAN_PAGE_COMPOUND, SCAN_ANY_PROCESS, SCAN_VMA_NULL, SCAN_VMA_CHECK, SCAN_ADDRESS_RANGE, SCAN_DEL_PAGE_LRU, SCAN_ALLOC_HUGE_PAGE_FAIL, SCAN_CGROUP_CHARGE_FAIL, SCAN_TRUNCATED, SCAN_PAGE_HAS_PRIVATE, SCAN_STORE_FAILED, SCAN_COPY_MC, SCAN_PAGE_FILLED, }; #define CREATE_TRACE_POINTS #include <trace/events/huge_memory.h> static struct task_struct *khugepaged_thread __read_mostly; static DEFINE_MUTEX(khugepaged_mutex); /* default scan 8*512 pte (or vmas) every 30 second */ static unsigned int khugepaged_pages_to_scan __read_mostly; static unsigned int khugepaged_pages_collapsed; static unsigned int khugepaged_full_scans; static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000; /* during fragmentation poll the hugepage allocator once every minute */ static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000; static unsigned long khugepaged_sleep_expire; static DEFINE_SPINLOCK(khugepaged_mm_lock); static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait); /* * default collapse hugepages if there is at least one pte mapped like * it would have happened if the vma was large enough during page * fault. * * Note that these are only respected if collapse was initiated by khugepaged. */ static unsigned int khugepaged_max_ptes_none __read_mostly; static unsigned int khugepaged_max_ptes_swap __read_mostly; static unsigned int khugepaged_max_ptes_shared __read_mostly; #define MM_SLOTS_HASH_BITS 10 static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); static struct kmem_cache *mm_slot_cache __ro_after_init; struct collapse_control { bool is_khugepaged; /* Num pages scanned per node */ u32 node_load[MAX_NUMNODES]; /* nodemask for allocation fallback */ nodemask_t alloc_nmask; }; /** * struct khugepaged_mm_slot - khugepaged information per mm that is being scanned * @slot: hash lookup from mm to mm_slot */ struct khugepaged_mm_slot { struct mm_slot slot; }; /** * struct khugepaged_scan - cursor for scanning * @mm_head: the head of the mm list to scan * @mm_slot: the current mm_slot we are scanning * @address: the next address inside that to be scanned * * There is only the one khugepaged_scan instance of this cursor structure. */ struct khugepaged_scan { struct list_head mm_head; struct khugepaged_mm_slot *mm_slot; unsigned long address; }; static struct khugepaged_scan khugepaged_scan = { .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head), }; #ifdef CONFIG_SYSFS static ssize_t scan_sleep_millisecs_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", khugepaged_scan_sleep_millisecs); } static ssize_t scan_sleep_millisecs_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { unsigned int msecs; int err; err = kstrtouint(buf, 10, &msecs); if (err) return -EINVAL; khugepaged_scan_sleep_millisecs = msecs; khugepaged_sleep_expire = 0; wake_up_interruptible(&khugepaged_wait); return count; } static struct kobj_attribute scan_sleep_millisecs_attr = __ATTR_RW(scan_sleep_millisecs); static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", khugepaged_alloc_sleep_millisecs); } static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { unsigned int msecs; int err; err = kstrtouint(buf, 10, &msecs); if (err) return -EINVAL; khugepaged_alloc_sleep_millisecs = msecs; khugepaged_sleep_expire = 0; wake_up_interruptible(&khugepaged_wait); return count; } static struct kobj_attribute alloc_sleep_millisecs_attr = __ATTR_RW(alloc_sleep_millisecs); static ssize_t pages_to_scan_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", khugepaged_pages_to_scan); } static ssize_t pages_to_scan_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { unsigned int pages; int err; err = kstrtouint(buf, 10, &pages); if (err || !pages) return -EINVAL; khugepaged_pages_to_scan = pages; return count; } static struct kobj_attribute pages_to_scan_attr = __ATTR_RW(pages_to_scan); static ssize_t pages_collapsed_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", khugepaged_pages_collapsed); } static struct kobj_attribute pages_collapsed_attr = __ATTR_RO(pages_collapsed); static ssize_t full_scans_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", khugepaged_full_scans); } static struct kobj_attribute full_scans_attr = __ATTR_RO(full_scans); static ssize_t defrag_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return single_hugepage_flag_show(kobj, attr, buf, TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); } static ssize_t defrag_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { return single_hugepage_flag_store(kobj, attr, buf, count, TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); } static struct kobj_attribute khugepaged_defrag_attr = __ATTR_RW(defrag); /* * max_ptes_none controls if khugepaged should collapse hugepages over * any unmapped ptes in turn potentially increasing the memory * footprint of the vmas. When max_ptes_none is 0 khugepaged will not * reduce the available free memory in the system as it * runs. Increasing max_ptes_none will instead potentially reduce the * free memory in the system during the khugepaged scan. */ static ssize_t max_ptes_none_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_none); } static ssize_t max_ptes_none_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; unsigned long max_ptes_none; err = kstrtoul(buf, 10, &max_ptes_none); if (err || max_ptes_none > HPAGE_PMD_NR - 1) return -EINVAL; khugepaged_max_ptes_none = max_ptes_none; return count; } static struct kobj_attribute khugepaged_max_ptes_none_attr = __ATTR_RW(max_ptes_none); static ssize_t max_ptes_swap_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_swap); } static ssize_t max_ptes_swap_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; unsigned long max_ptes_swap; err = kstrtoul(buf, 10, &max_ptes_swap); if (err || max_ptes_swap > HPAGE_PMD_NR - 1) return -EINVAL; khugepaged_max_ptes_swap = max_ptes_swap; return count; } static struct kobj_attribute khugepaged_max_ptes_swap_attr = __ATTR_RW(max_ptes_swap); static ssize_t max_ptes_shared_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_shared); } static ssize_t max_ptes_shared_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; unsigned long max_ptes_shared; err = kstrtoul(buf, 10, &max_ptes_shared); if (err || max_ptes_shared > HPAGE_PMD_NR - 1) return -EINVAL; khugepaged_max_ptes_shared = max_ptes_shared; return count; } static struct kobj_attribute khugepaged_max_ptes_shared_attr = __ATTR_RW(max_ptes_shared); static struct attribute *khugepaged_attr[] = { &khugepaged_defrag_attr.attr, &khugepaged_max_ptes_none_attr.attr, &khugepaged_max_ptes_swap_attr.attr, &khugepaged_max_ptes_shared_attr.attr, &pages_to_scan_attr.attr, &pages_collapsed_attr.attr, &full_scans_attr.attr, &scan_sleep_millisecs_attr.attr, &alloc_sleep_millisecs_attr.attr, NULL, }; struct attribute_group khugepaged_attr_group = { .attrs = khugepaged_attr, .name = "khugepaged", }; #endif /* CONFIG_SYSFS */ int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice) { switch (advice) { case MADV_HUGEPAGE: #ifdef CONFIG_S390 /* * qemu blindly sets MADV_HUGEPAGE on all allocations, but s390 * can't handle this properly after s390_enable_sie, so we simply * ignore the madvise to prevent qemu from causing a SIGSEGV. */ if (mm_has_pgste(vma->vm_mm)) return 0; #endif *vm_flags &= ~VM_NOHUGEPAGE; *vm_flags |= VM_HUGEPAGE; /* * If the vma become good for khugepaged to scan, * register it here without waiting a page fault that * may not happen any time soon. */ khugepaged_enter_vma(vma, *vm_flags); break; case MADV_NOHUGEPAGE: *vm_flags &= ~VM_HUGEPAGE; *vm_flags |= VM_NOHUGEPAGE; /* * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning * this vma even if we leave the mm registered in khugepaged if * it got registered before VM_NOHUGEPAGE was set. */ break; } return 0; } int __init khugepaged_init(void) { mm_slot_cache = kmem_cache_create("khugepaged_mm_slot", sizeof(struct khugepaged_mm_slot), __alignof__(struct khugepaged_mm_slot), 0, NULL); if (!mm_slot_cache) return -ENOMEM; khugepaged_pages_to_scan = HPAGE_PMD_NR * 8; khugepaged_max_ptes_none = HPAGE_PMD_NR - 1; khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8; khugepaged_max_ptes_shared = HPAGE_PMD_NR / 2; return 0; } void __init khugepaged_destroy(void) { kmem_cache_destroy(mm_slot_cache); } static inline int hpage_collapse_test_exit(struct mm_struct *mm) { return atomic_read(&mm->mm_users) == 0; } void __khugepaged_enter(struct mm_struct *mm) { struct khugepaged_mm_slot *mm_slot; struct mm_slot *slot; int wakeup; /* __khugepaged_exit() must not run from under us */ VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm); if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) return; mm_slot = mm_slot_alloc(mm_slot_cache); if (!mm_slot) return; slot = &mm_slot->slot; spin_lock(&khugepaged_mm_lock); mm_slot_insert(mm_slots_hash, mm, slot); /* * Insert just behind the scanning cursor, to let the area settle * down a little. */ wakeup = list_empty(&khugepaged_scan.mm_head); list_add_tail(&slot->mm_node, &khugepaged_scan.mm_head); spin_unlock(&khugepaged_mm_lock); mmgrab(mm); if (wakeup) wake_up_interruptible(&khugepaged_wait); } void khugepaged_enter_vma(struct vm_area_struct *vma, unsigned long vm_flags) { if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) && hugepage_flags_enabled()) { if (thp_vma_allowable_order(vma, vm_flags, false, false, true, PMD_ORDER)) __khugepaged_enter(vma->vm_mm); } } void __khugepaged_exit(struct mm_struct *mm) { struct khugepaged_mm_slot *mm_slot; struct mm_slot *slot; int free = 0; spin_lock(&khugepaged_mm_lock); slot = mm_slot_lookup(mm_slots_hash, mm); mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot); if (mm_slot && khugepaged_scan.mm_slot != mm_slot) { hash_del(&slot->hash); list_del(&slot->mm_node); free = 1; } spin_unlock(&khugepaged_mm_lock); if (free) { clear_bit(MMF_VM_HUGEPAGE, &mm->flags); mm_slot_free(mm_slot_cache, mm_slot); mmdrop(mm); } else if (mm_slot) { /* * This is required to serialize against * hpage_collapse_test_exit() (which is guaranteed to run * under mmap sem read mode). Stop here (after we return all * pagetables will be destroyed) until khugepaged has finished * working on the pagetables under the mmap_lock. */ mmap_write_lock(mm); mmap_write_unlock(mm); } } static void release_pte_folio(struct folio *folio) { node_stat_mod_folio(folio, NR_ISOLATED_ANON + folio_is_file_lru(folio), -folio_nr_pages(folio)); folio_unlock(folio); folio_putback_lru(folio); } static void release_pte_pages(pte_t *pte, pte_t *_pte, struct list_head *compound_pagelist) { struct folio *folio, *tmp; while (--_pte >= pte) { pte_t pteval = ptep_get(_pte); unsigned long pfn; if (pte_none(pteval)) continue; pfn = pte_pfn(pteval); if (is_zero_pfn(pfn)) continue; folio = pfn_folio(pfn); if (folio_test_large(folio)) continue; release_pte_folio(folio); } list_for_each_entry_safe(folio, tmp, compound_pagelist, lru) { list_del(&folio->lru); release_pte_folio(folio); } } static bool is_refcount_suitable(struct folio *folio) { int expected_refcount; expected_refcount = folio_mapcount(folio); if (folio_test_swapcache(folio)) expected_refcount += folio_nr_pages(folio); return folio_ref_count(folio) == expected_refcount; } static int __collapse_huge_page_isolate(struct vm_area_struct *vma, unsigned long address, pte_t *pte, struct collapse_control *cc, struct list_head *compound_pagelist) { struct page *page = NULL; struct folio *folio = NULL; pte_t *_pte; int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0; bool writable = false; for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte++, address += PAGE_SIZE) { pte_t pteval = ptep_get(_pte); if (pte_none(pteval) || (pte_present(pteval) && is_zero_pfn(pte_pfn(pteval)))) { ++none_or_zero; if (!userfaultfd_armed(vma) && (!cc->is_khugepaged || none_or_zero <= khugepaged_max_ptes_none)) { continue; } else { result = SCAN_EXCEED_NONE_PTE; count_vm_event(THP_SCAN_EXCEED_NONE_PTE); goto out; } } if (!pte_present(pteval)) { result = SCAN_PTE_NON_PRESENT; goto out; } if (pte_uffd_wp(pteval)) { result = SCAN_PTE_UFFD_WP; goto out; } page = vm_normal_page(vma, address, pteval); if (unlikely(!page) || unlikely(is_zone_device_page(page))) { result = SCAN_PAGE_NULL; goto out; } folio = page_folio(page); VM_BUG_ON_FOLIO(!folio_test_anon(folio), folio); if (page_mapcount(page) > 1) { ++shared; if (cc->is_khugepaged && shared > khugepaged_max_ptes_shared) { result = SCAN_EXCEED_SHARED_PTE; count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); goto out; } } if (folio_test_large(folio)) { struct folio *f; /* * Check if we have dealt with the compound page * already */ list_for_each_entry(f, compound_pagelist, lru) { if (folio == f) goto next; } } /* * We can do it before isolate_lru_page because the * page can't be freed from under us. NOTE: PG_lock * is needed to serialize against split_huge_page * when invoked from the VM. */ if (!folio_trylock(folio)) { result = SCAN_PAGE_LOCK; goto out; } /* * Check if the page has any GUP (or other external) pins. * * The page table that maps the page has been already unlinked * from the page table tree and this process cannot get * an additional pin on the page. * * New pins can come later if the page is shared across fork, * but not from this process. The other process cannot write to * the page, only trigger CoW. */ if (!is_refcount_suitable(folio)) { folio_unlock(folio); result = SCAN_PAGE_COUNT; goto out; } /* * Isolate the page to avoid collapsing an hugepage * currently in use by the VM. */ if (!folio_isolate_lru(folio)) { folio_unlock(folio); result = SCAN_DEL_PAGE_LRU; goto out; } node_stat_mod_folio(folio, NR_ISOLATED_ANON + folio_is_file_lru(folio), folio_nr_pages(folio)); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); if (folio_test_large(folio)) list_add_tail(&folio->lru, compound_pagelist); next: /* * If collapse was initiated by khugepaged, check that there is * enough young pte to justify collapsing the page */ if (cc->is_khugepaged && (pte_young(pteval) || folio_test_young(folio) || folio_test_referenced(folio) || mmu_notifier_test_young(vma->vm_mm, address))) referenced++; if (pte_write(pteval)) writable = true; } if (unlikely(!writable)) { result = SCAN_PAGE_RO; } else if (unlikely(cc->is_khugepaged && !referenced)) { result = SCAN_LACK_REFERENCED_PAGE; } else { result = SCAN_SUCCEED; trace_mm_collapse_huge_page_isolate(&folio->page, none_or_zero, referenced, writable, result); return result; } out: release_pte_pages(pte, _pte, compound_pagelist); trace_mm_collapse_huge_page_isolate(&folio->page, none_or_zero, referenced, writable, result); return result; } static void __collapse_huge_page_copy_succeeded(pte_t *pte, struct vm_area_struct *vma, unsigned long address, spinlock_t *ptl, struct list_head *compound_pagelist) { struct folio *src_folio; struct page *src_page; struct page *tmp; pte_t *_pte; pte_t pteval; for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte++, address += PAGE_SIZE) { pteval = ptep_get(_pte); if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); if (is_zero_pfn(pte_pfn(pteval))) { /* * ptl mostly unnecessary. */ spin_lock(ptl); ptep_clear(vma->vm_mm, address, _pte); spin_unlock(ptl); ksm_might_unmap_zero_page(vma->vm_mm, pteval); } } else { src_page = pte_page(pteval); src_folio = page_folio(src_page); if (!folio_test_large(src_folio)) release_pte_folio(src_folio); /* * ptl mostly unnecessary, but preempt has to * be disabled to update the per-cpu stats * inside folio_remove_rmap_pte(). */ spin_lock(ptl); ptep_clear(vma->vm_mm, address, _pte); folio_remove_rmap_pte(src_folio, src_page, vma); spin_unlock(ptl); free_page_and_swap_cache(src_page); } } list_for_each_entry_safe(src_page, tmp, compound_pagelist, lru) { list_del(&src_page->lru); mod_node_page_state(page_pgdat(src_page), NR_ISOLATED_ANON + page_is_file_lru(src_page), -compound_nr(src_page)); unlock_page(src_page); free_swap_cache(src_page); putback_lru_page(src_page); } } static void __collapse_huge_page_copy_failed(pte_t *pte, pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma, struct list_head *compound_pagelist) { spinlock_t *pmd_ptl; /* * Re-establish the PMD to point to the original page table * entry. Restoring PMD needs to be done prior to releasing * pages. Since pages are still isolated and locked here, * acquiring anon_vma_lock_write is unnecessary. */ pmd_ptl = pmd_lock(vma->vm_mm, pmd); pmd_populate(vma->vm_mm, pmd, pmd_pgtable(orig_pmd)); spin_unlock(pmd_ptl); /* * Release both raw and compound pages isolated * in __collapse_huge_page_isolate. */ release_pte_pages(pte, pte + HPAGE_PMD_NR, compound_pagelist); } /* * __collapse_huge_page_copy - attempts to copy memory contents from raw * pages to a hugepage. Cleans up the raw pages if copying succeeds; * otherwise restores the original page table and releases isolated raw pages. * Returns SCAN_SUCCEED if copying succeeds, otherwise returns SCAN_COPY_MC. * * @pte: starting of the PTEs to copy from * @page: the new hugepage to copy contents to * @pmd: pointer to the new hugepage's PMD * @orig_pmd: the original raw pages' PMD * @vma: the original raw pages' virtual memory area * @address: starting address to copy * @ptl: lock on raw pages' PTEs * @compound_pagelist: list that stores compound pages */ static int __collapse_huge_page_copy(pte_t *pte, struct page *page, pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma, unsigned long address, spinlock_t *ptl, struct list_head *compound_pagelist) { struct page *src_page; pte_t *_pte; pte_t pteval; unsigned long _address; int result = SCAN_SUCCEED; /* * Copying pages' contents is subject to memory poison at any iteration. */ for (_pte = pte, _address = address; _pte < pte + HPAGE_PMD_NR; _pte++, page++, _address += PAGE_SIZE) { pteval = ptep_get(_pte); if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { clear_user_highpage(page, _address); continue; } src_page = pte_page(pteval); if (copy_mc_user_highpage(page, src_page, _address, vma) > 0) { result = SCAN_COPY_MC; break; } } if (likely(result == SCAN_SUCCEED)) __collapse_huge_page_copy_succeeded(pte, vma, address, ptl, compound_pagelist); else __collapse_huge_page_copy_failed(pte, pmd, orig_pmd, vma, compound_pagelist); return result; } static void khugepaged_alloc_sleep(void) { DEFINE_WAIT(wait); add_wait_queue(&khugepaged_wait, &wait); __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); schedule_timeout(msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); remove_wait_queue(&khugepaged_wait, &wait); } struct collapse_control khugepaged_collapse_control = { .is_khugepaged = true, }; static bool hpage_collapse_scan_abort(int nid, struct collapse_control *cc) { int i; /* * If node_reclaim_mode is disabled, then no extra effort is made to * allocate memory locally. */ if (!node_reclaim_enabled()) return false; /* If there is a count for this node already, it must be acceptable */ if (cc->node_load[nid]) return false; for (i = 0; i < MAX_NUMNODES; i++) { if (!cc->node_load[i]) continue; if (node_distance(nid, i) > node_reclaim_distance) return true; } return false; } #define khugepaged_defrag() \ (transparent_hugepage_flags & \ (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)) /* Defrag for khugepaged will enter direct reclaim/compaction if necessary */ static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void) { return khugepaged_defrag() ? GFP_TRANSHUGE : GFP_TRANSHUGE_LIGHT; } #ifdef CONFIG_NUMA static int hpage_collapse_find_target_node(struct collapse_control *cc) { int nid, target_node = 0, max_value = 0; /* find first node with max normal pages hit */ for (nid = 0; nid < MAX_NUMNODES; nid++) if (cc->node_load[nid] > max_value) { max_value = cc->node_load[nid]; target_node = nid; } for_each_online_node(nid) { if (max_value == cc->node_load[nid]) node_set(nid, cc->alloc_nmask); } return target_node; } #else static int hpage_collapse_find_target_node(struct collapse_control *cc) { return 0; } #endif static bool hpage_collapse_alloc_folio(struct folio **folio, gfp_t gfp, int node, nodemask_t *nmask) { *folio = __folio_alloc(gfp, HPAGE_PMD_ORDER, node, nmask); if (unlikely(!*folio)) { count_vm_event(THP_COLLAPSE_ALLOC_FAILED); return false; } count_vm_event(THP_COLLAPSE_ALLOC); return true; } /* * If mmap_lock temporarily dropped, revalidate vma * before taking mmap_lock. * Returns enum scan_result value. */ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, bool expect_anon, struct vm_area_struct **vmap, struct collapse_control *cc) { struct vm_area_struct *vma; if (unlikely(hpage_collapse_test_exit(mm))) return SCAN_ANY_PROCESS; *vmap = vma = find_vma(mm, address); if (!vma) return SCAN_VMA_NULL; if (!thp_vma_suitable_order(vma, address, PMD_ORDER)) return SCAN_ADDRESS_RANGE; if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false, cc->is_khugepaged, PMD_ORDER)) return SCAN_VMA_CHECK; /* * Anon VMA expected, the address may be unmapped then * remapped to file after khugepaged reaquired the mmap_lock. * * thp_vma_allowable_order may return true for qualified file * vmas. */ if (expect_anon && (!(*vmap)->anon_vma || !vma_is_anonymous(*vmap))) return SCAN_PAGE_ANON; return SCAN_SUCCEED; } static int find_pmd_or_thp_or_none(struct mm_struct *mm, unsigned long address, pmd_t **pmd) { pmd_t pmde; *pmd = mm_find_pmd(mm, address); if (!*pmd) return SCAN_PMD_NULL; pmde = pmdp_get_lockless(*pmd); if (pmd_none(pmde)) return SCAN_PMD_NONE; if (!pmd_present(pmde)) return SCAN_PMD_NULL; if (pmd_trans_huge(pmde)) return SCAN_PMD_MAPPED; if (pmd_devmap(pmde)) return SCAN_PMD_NULL; if (pmd_bad(pmde)) return SCAN_PMD_NULL; return SCAN_SUCCEED; } static int check_pmd_still_valid(struct mm_struct *mm, unsigned long address, pmd_t *pmd) { pmd_t *new_pmd; int result = find_pmd_or_thp_or_none(mm, address, &new_pmd); if (result != SCAN_SUCCEED) return result; if (new_pmd != pmd) return SCAN_FAIL; return SCAN_SUCCEED; } /* * Bring missing pages in from swap, to complete THP collapse. * Only done if hpage_collapse_scan_pmd believes it is worthwhile. * * Called and returns without pte mapped or spinlocks held. * Returns result: if not SCAN_SUCCEED, mmap_lock has been released. */ static int __collapse_huge_page_swapin(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, int referenced) { int swapped_in = 0; vm_fault_t ret = 0; unsigned long address, end = haddr + (HPAGE_PMD_NR * PAGE_SIZE); int result; pte_t *pte = NULL; spinlock_t *ptl; for (address = haddr; address < end; address += PAGE_SIZE) { struct vm_fault vmf = { .vma = vma, .address = address, .pgoff = linear_page_index(vma, address), .flags = FAULT_FLAG_ALLOW_RETRY, .pmd = pmd, }; if (!pte++) { pte = pte_offset_map_nolock(mm, pmd, address, &ptl); if (!pte) { mmap_read_unlock(mm); result = SCAN_PMD_NULL; goto out; } } vmf.orig_pte = ptep_get_lockless(pte); if (!is_swap_pte(vmf.orig_pte)) continue; vmf.pte = pte; vmf.ptl = ptl; ret = do_swap_page(&vmf); /* Which unmaps pte (after perhaps re-checking the entry) */ pte = NULL; /* * do_swap_page returns VM_FAULT_RETRY with released mmap_lock. * Note we treat VM_FAULT_RETRY as VM_FAULT_ERROR here because * we do not retry here and swap entry will remain in pagetable * resulting in later failure. */ if (ret & VM_FAULT_RETRY) { /* Likely, but not guaranteed, that page lock failed */ result = SCAN_PAGE_LOCK; goto out; } if (ret & VM_FAULT_ERROR) { mmap_read_unlock(mm); result = SCAN_FAIL; goto out; } swapped_in++; } if (pte) pte_unmap(pte); /* Drain LRU cache to remove extra pin on the swapped in pages */ if (swapped_in) lru_add_drain(); result = SCAN_SUCCEED; out: trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, result); return result; } static int alloc_charge_hpage(struct page **hpage, struct mm_struct *mm, struct collapse_control *cc) { gfp_t gfp = (cc->is_khugepaged ? alloc_hugepage_khugepaged_gfpmask() : GFP_TRANSHUGE); int node = hpage_collapse_find_target_node(cc); struct folio *folio; if (!hpage_collapse_alloc_folio(&folio, gfp, node, &cc->alloc_nmask)) { *hpage = NULL; return SCAN_ALLOC_HUGE_PAGE_FAIL; } if (unlikely(mem_cgroup_charge(folio, mm, gfp))) { folio_put(folio); *hpage = NULL; return SCAN_CGROUP_CHARGE_FAIL; } count_memcg_folio_events(folio, THP_COLLAPSE_ALLOC, 1); *hpage = folio_page(folio, 0); return SCAN_SUCCEED; } static int collapse_huge_page(struct mm_struct *mm, unsigned long address, int referenced, int unmapped, struct collapse_control *cc) { LIST_HEAD(compound_pagelist); pmd_t *pmd, _pmd; pte_t *pte; pgtable_t pgtable; struct folio *folio; struct page *hpage; spinlock_t *pmd_ptl, *pte_ptl; int result = SCAN_FAIL; struct vm_area_struct *vma; struct mmu_notifier_range range; VM_BUG_ON(address & ~HPAGE_PMD_MASK); /* * Before allocating the hugepage, release the mmap_lock read lock. * The allocation can take potentially a long time if it involves * sync compaction, and we do not need to hold the mmap_lock during * that. We will recheck the vma after taking it again in write mode. */ mmap_read_unlock(mm); result = alloc_charge_hpage(&hpage, mm, cc); if (result != SCAN_SUCCEED) goto out_nolock; mmap_read_lock(mm); result = hugepage_vma_revalidate(mm, address, true, &vma, cc); if (result != SCAN_SUCCEED) { mmap_read_unlock(mm); goto out_nolock; } result = find_pmd_or_thp_or_none(mm, address, &pmd); if (result != SCAN_SUCCEED) { mmap_read_unlock(mm); goto out_nolock; } if (unmapped) { /* * __collapse_huge_page_swapin will return with mmap_lock * released when it fails. So we jump out_nolock directly in * that case. Continuing to collapse causes inconsistency. */ result = __collapse_huge_page_swapin(mm, vma, address, pmd, referenced); if (result != SCAN_SUCCEED) goto out_nolock; } mmap_read_unlock(mm); /* * Prevent all access to pagetables with the exception of * gup_fast later handled by the ptep_clear_flush and the VM * handled by the anon_vma lock + PG_lock. * * UFFDIO_MOVE is prevented to race as well thanks to the * mmap_lock. */ mmap_write_lock(mm); result = hugepage_vma_revalidate(mm, address, true, &vma, cc); if (result != SCAN_SUCCEED) goto out_up_write; /* check if the pmd is still valid */ result = check_pmd_still_valid(mm, address, pmd); if (result != SCAN_SUCCEED) goto out_up_write; vma_start_write(vma); anon_vma_lock_write(vma->anon_vma); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address, address + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */ /* * This removes any huge TLB entry from the CPU so we won't allow * huge and small TLB entries for the same virtual address to * avoid the risk of CPU bugs in that area. * * Parallel fast GUP is fine since fast GUP will back off when * it detects PMD is changed. */ _pmd = pmdp_collapse_flush(vma, address, pmd); spin_unlock(pmd_ptl); mmu_notifier_invalidate_range_end(&range); tlb_remove_table_sync_one(); pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl); if (pte) { result = __collapse_huge_page_isolate(vma, address, pte, cc, &compound_pagelist); spin_unlock(pte_ptl); } else { result = SCAN_PMD_NULL; } if (unlikely(result != SCAN_SUCCEED)) { if (pte) pte_unmap(pte); spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); /* * We can only use set_pmd_at when establishing * hugepmds and never for establishing regular pmds that * points to regular pagetables. Use pmd_populate for that */ pmd_populate(mm, pmd, pmd_pgtable(_pmd)); spin_unlock(pmd_ptl); anon_vma_unlock_write(vma->anon_vma); goto out_up_write; } /* * All pages are isolated and locked so anon_vma rmap * can't run anymore. */ anon_vma_unlock_write(vma->anon_vma); result = __collapse_huge_page_copy(pte, hpage, pmd, _pmd, vma, address, pte_ptl, &compound_pagelist); pte_unmap(pte); if (unlikely(result != SCAN_SUCCEED)) goto out_up_write; folio = page_folio(hpage); /* * The smp_wmb() inside __folio_mark_uptodate() ensures the * copy_huge_page writes become visible before the set_pmd_at() * write. */ __folio_mark_uptodate(folio); pgtable = pmd_pgtable(_pmd); _pmd = mk_huge_pmd(hpage, vma->vm_page_prot); _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); folio_add_new_anon_rmap(folio, vma, address); folio_add_lru_vma(folio, vma); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, address, pmd, _pmd); update_mmu_cache_pmd(vma, address, pmd); spin_unlock(pmd_ptl); hpage = NULL; result = SCAN_SUCCEED; out_up_write: mmap_write_unlock(mm); out_nolock: if (hpage) put_page(hpage); trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result); return result; } static int hpage_collapse_scan_pmd(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, bool *mmap_locked, struct collapse_control *cc) { pmd_t *pmd; pte_t *pte, *_pte; int result = SCAN_FAIL, referenced = 0; int none_or_zero = 0, shared = 0; struct page *page = NULL; struct folio *folio = NULL; unsigned long _address; spinlock_t *ptl; int node = NUMA_NO_NODE, unmapped = 0; bool writable = false; VM_BUG_ON(address & ~HPAGE_PMD_MASK); result = find_pmd_or_thp_or_none(mm, address, &pmd); if (result != SCAN_SUCCEED) goto out; memset(cc->node_load, 0, sizeof(cc->node_load)); nodes_clear(cc->alloc_nmask); pte = pte_offset_map_lock(mm, pmd, address, &ptl); if (!pte) { result = SCAN_PMD_NULL; goto out; } for (_address = address, _pte = pte; _pte < pte + HPAGE_PMD_NR; _pte++, _address += PAGE_SIZE) { pte_t pteval = ptep_get(_pte); if (is_swap_pte(pteval)) { ++unmapped; if (!cc->is_khugepaged || unmapped <= khugepaged_max_ptes_swap) { /* * Always be strict with uffd-wp * enabled swap entries. Please see * comment below for pte_uffd_wp(). */ if (pte_swp_uffd_wp_any(pteval)) { result = SCAN_PTE_UFFD_WP; goto out_unmap; } continue; } else { result = SCAN_EXCEED_SWAP_PTE; count_vm_event(THP_SCAN_EXCEED_SWAP_PTE); goto out_unmap; } } if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { ++none_or_zero; if (!userfaultfd_armed(vma) && (!cc->is_khugepaged || none_or_zero <= khugepaged_max_ptes_none)) { continue; } else { result = SCAN_EXCEED_NONE_PTE; count_vm_event(THP_SCAN_EXCEED_NONE_PTE); goto out_unmap; } } if (pte_uffd_wp(pteval)) { /* * Don't collapse the page if any of the small * PTEs are armed with uffd write protection. * Here we can also mark the new huge pmd as * write protected if any of the small ones is * marked but that could bring unknown * userfault messages that falls outside of * the registered range. So, just be simple. */ result = SCAN_PTE_UFFD_WP; goto out_unmap; } if (pte_write(pteval)) writable = true; page = vm_normal_page(vma, _address, pteval); if (unlikely(!page) || unlikely(is_zone_device_page(page))) { result = SCAN_PAGE_NULL; goto out_unmap; } if (page_mapcount(page) > 1) { ++shared; if (cc->is_khugepaged && shared > khugepaged_max_ptes_shared) { result = SCAN_EXCEED_SHARED_PTE; count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); goto out_unmap; } } folio = page_folio(page); /* * Record which node the original page is from and save this * information to cc->node_load[]. * Khugepaged will allocate hugepage from the node has the max * hit record. */ node = folio_nid(folio); if (hpage_collapse_scan_abort(node, cc)) { result = SCAN_SCAN_ABORT; goto out_unmap; } cc->node_load[node]++; if (!folio_test_lru(folio)) { result = SCAN_PAGE_LRU; goto out_unmap; } if (folio_test_locked(folio)) { result = SCAN_PAGE_LOCK; goto out_unmap; } if (!folio_test_anon(folio)) { result = SCAN_PAGE_ANON; goto out_unmap; } /* * Check if the page has any GUP (or other external) pins. * * Here the check may be racy: * it may see total_mapcount > refcount in some cases? * But such case is ephemeral we could always retry collapse * later. However it may report false positive if the page * has excessive GUP pins (i.e. 512). Anyway the same check * will be done again later the risk seems low. */ if (!is_refcount_suitable(folio)) { result = SCAN_PAGE_COUNT; goto out_unmap; } /* * If collapse was initiated by khugepaged, check that there is * enough young pte to justify collapsing the page */ if (cc->is_khugepaged && (pte_young(pteval) || folio_test_young(folio) || folio_test_referenced(folio) || mmu_notifier_test_young(vma->vm_mm, address))) referenced++; } if (!writable) { result = SCAN_PAGE_RO; } else if (cc->is_khugepaged && (!referenced || (unmapped && referenced < HPAGE_PMD_NR / 2))) { result = SCAN_LACK_REFERENCED_PAGE; } else { result = SCAN_SUCCEED; } out_unmap: pte_unmap_unlock(pte, ptl); if (result == SCAN_SUCCEED) { result = collapse_huge_page(mm, address, referenced, unmapped, cc); /* collapse_huge_page will return with the mmap_lock released */ *mmap_locked = false; } out: trace_mm_khugepaged_scan_pmd(mm, &folio->page, writable, referenced, none_or_zero, result, unmapped); return result; } static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot) { struct mm_slot *slot = &mm_slot->slot; struct mm_struct *mm = slot->mm; lockdep_assert_held(&khugepaged_mm_lock); if (hpage_collapse_test_exit(mm)) { /* free mm_slot */ hash_del(&slot->hash); list_del(&slot->mm_node); /* * Not strictly needed because the mm exited already. * * clear_bit(MMF_VM_HUGEPAGE, &mm->flags); */ /* khugepaged_mm_lock actually not necessary for the below */ mm_slot_free(mm_slot_cache, mm_slot); mmdrop(mm); } } #ifdef CONFIG_SHMEM /* hpage must be locked, and mmap_lock must be held */ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp, struct page *hpage) { struct vm_fault vmf = { .vma = vma, .address = addr, .flags = 0, .pmd = pmdp, }; VM_BUG_ON(!PageTransHuge(hpage)); mmap_assert_locked(vma->vm_mm); if (do_set_pmd(&vmf, hpage)) return SCAN_FAIL; get_page(hpage); return SCAN_SUCCEED; } /** * collapse_pte_mapped_thp - Try to collapse a pte-mapped THP for mm at * address haddr. * * @mm: process address space where collapse happens * @addr: THP collapse address * @install_pmd: If a huge PMD should be installed * * This function checks whether all the PTEs in the PMD are pointing to the * right THP. If so, retract the page table so the THP can refault in with * as pmd-mapped. Possibly install a huge PMD mapping the THP. */ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, bool install_pmd) { struct mmu_notifier_range range; bool notified = false; unsigned long haddr = addr & HPAGE_PMD_MASK; struct vm_area_struct *vma = vma_lookup(mm, haddr); struct folio *folio; pte_t *start_pte, *pte; pmd_t *pmd, pgt_pmd; spinlock_t *pml = NULL, *ptl; int nr_ptes = 0, result = SCAN_FAIL; int i; mmap_assert_locked(mm); /* First check VMA found, in case page tables are being torn down */ if (!vma || !vma->vm_file || !range_in_vma(vma, haddr, haddr + HPAGE_PMD_SIZE)) return SCAN_VMA_CHECK; /* Fast check before locking page if already PMD-mapped */ result = find_pmd_or_thp_or_none(mm, haddr, &pmd); if (result == SCAN_PMD_MAPPED) return result; /* * If we are here, we've succeeded in replacing all the native pages * in the page cache with a single hugepage. If a mm were to fault-in * this memory (mapped by a suitably aligned VMA), we'd get the hugepage * and map it by a PMD, regardless of sysfs THP settings. As such, let's * analogously elide sysfs THP settings here. */ if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false, false, PMD_ORDER)) return SCAN_VMA_CHECK; /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */ if (userfaultfd_wp(vma)) return SCAN_PTE_UFFD_WP; folio = filemap_lock_folio(vma->vm_file->f_mapping, linear_page_index(vma, haddr)); if (IS_ERR(folio)) return SCAN_PAGE_NULL; if (folio_order(folio) != HPAGE_PMD_ORDER) { result = SCAN_PAGE_COMPOUND; goto drop_folio; } result = find_pmd_or_thp_or_none(mm, haddr, &pmd); switch (result) { case SCAN_SUCCEED: break; case SCAN_PMD_NONE: /* * All pte entries have been removed and pmd cleared. * Skip all the pte checks and just update the pmd mapping. */ goto maybe_install_pmd; default: goto drop_folio; } result = SCAN_FAIL; start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl); if (!start_pte) /* mmap_lock + page lock should prevent this */ goto drop_folio; /* step 1: check all mapped PTEs are to the right huge page */ for (i = 0, addr = haddr, pte = start_pte; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) { struct page *page; pte_t ptent = ptep_get(pte); /* empty pte, skip */ if (pte_none(ptent)) continue; /* page swapped out, abort */ if (!pte_present(ptent)) { result = SCAN_PTE_NON_PRESENT; goto abort; } page = vm_normal_page(vma, addr, ptent); if (WARN_ON_ONCE(page && is_zone_device_page(page))) page = NULL; /* * Note that uprobe, debugger, or MAP_PRIVATE may change the * page table, but the new page will not be a subpage of hpage. */ if (folio_page(folio, i) != page) goto abort; } pte_unmap_unlock(start_pte, ptl); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, haddr, haddr + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); notified = true; /* * pmd_lock covers a wider range than ptl, and (if split from mm's * page_table_lock) ptl nests inside pml. The less time we hold pml, * the better; but userfaultfd's mfill_atomic_pte() on a private VMA * inserts a valid as-if-COWed PTE without even looking up page cache. * So page lock of folio does not protect from it, so we must not drop * ptl before pgt_pmd is removed, so uffd private needs pml taken now. */ if (userfaultfd_armed(vma) && !(vma->vm_flags & VM_SHARED)) pml = pmd_lock(mm, pmd); start_pte = pte_offset_map_nolock(mm, pmd, haddr, &ptl); if (!start_pte) /* mmap_lock + page lock should prevent this */ goto abort; if (!pml) spin_lock(ptl); else if (ptl != pml) spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); /* step 2: clear page table and adjust rmap */ for (i = 0, addr = haddr, pte = start_pte; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) { struct page *page; pte_t ptent = ptep_get(pte); if (pte_none(ptent)) continue; /* * We dropped ptl after the first scan, to do the mmu_notifier: * page lock stops more PTEs of the folio being faulted in, but * does not stop write faults COWing anon copies from existing * PTEs; and does not stop those being swapped out or migrated. */ if (!pte_present(ptent)) { result = SCAN_PTE_NON_PRESENT; goto abort; } page = vm_normal_page(vma, addr, ptent); if (folio_page(folio, i) != page) goto abort; /* * Must clear entry, or a racing truncate may re-remove it. * TLB flush can be left until pmdp_collapse_flush() does it. * PTE dirty? Shmem page is already dirty; file is read-only. */ ptep_clear(mm, addr, pte); folio_remove_rmap_pte(folio, page, vma); nr_ptes++; } pte_unmap(start_pte); if (!pml) spin_unlock(ptl); /* step 3: set proper refcount and mm_counters. */ if (nr_ptes) { folio_ref_sub(folio, nr_ptes); add_mm_counter(mm, mm_counter_file(&folio->page), -nr_ptes); } /* step 4: remove empty page table */ if (!pml) { pml = pmd_lock(mm, pmd); if (ptl != pml) spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); } pgt_pmd = pmdp_collapse_flush(vma, haddr, pmd); pmdp_get_lockless_sync(); if (ptl != pml) spin_unlock(ptl); spin_unlock(pml); mmu_notifier_invalidate_range_end(&range); mm_dec_nr_ptes(mm); page_table_check_pte_clear_range(mm, haddr, pgt_pmd); pte_free_defer(mm, pmd_pgtable(pgt_pmd)); maybe_install_pmd: /* step 5: install pmd entry */ result = install_pmd ? set_huge_pmd(vma, haddr, pmd, &folio->page) : SCAN_SUCCEED; goto drop_folio; abort: if (nr_ptes) { flush_tlb_mm(mm); folio_ref_sub(folio, nr_ptes); add_mm_counter(mm, mm_counter_file(&folio->page), -nr_ptes); } if (start_pte) pte_unmap_unlock(start_pte, ptl); if (pml && pml != ptl) spin_unlock(pml); if (notified) mmu_notifier_invalidate_range_end(&range); drop_folio: folio_unlock(folio); folio_put(folio); return result; } static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) { struct vm_area_struct *vma; i_mmap_lock_read(mapping); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { struct mmu_notifier_range range; struct mm_struct *mm; unsigned long addr; pmd_t *pmd, pgt_pmd; spinlock_t *pml; spinlock_t *ptl; bool skipped_uffd = false; /* * Check vma->anon_vma to exclude MAP_PRIVATE mappings that * got written to. These VMAs are likely not worth removing * page tables from, as PMD-mapping is likely to be split later. */ if (READ_ONCE(vma->anon_vma)) continue; addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); if (addr & ~HPAGE_PMD_MASK || vma->vm_end < addr + HPAGE_PMD_SIZE) continue; mm = vma->vm_mm; if (find_pmd_or_thp_or_none(mm, addr, &pmd) != SCAN_SUCCEED) continue; if (hpage_collapse_test_exit(mm)) continue; /* * When a vma is registered with uffd-wp, we cannot recycle * the page table because there may be pte markers installed. * Other vmas can still have the same file mapped hugely, but * skip this one: it will always be mapped in small page size * for uffd-wp registered ranges. */ if (userfaultfd_wp(vma)) continue; /* PTEs were notified when unmapped; but now for the PMD? */ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr, addr + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); pml = pmd_lock(mm, pmd); ptl = pte_lockptr(mm, pmd); if (ptl != pml) spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); /* * Huge page lock is still held, so normally the page table * must remain empty; and we have already skipped anon_vma * and userfaultfd_wp() vmas. But since the mmap_lock is not * held, it is still possible for a racing userfaultfd_ioctl() * to have inserted ptes or markers. Now that we hold ptlock, * repeating the anon_vma check protects from one category, * and repeating the userfaultfd_wp() check from another. */ if (unlikely(vma->anon_vma || userfaultfd_wp(vma))) { skipped_uffd = true; } else { pgt_pmd = pmdp_collapse_flush(vma, addr, pmd); pmdp_get_lockless_sync(); } if (ptl != pml) spin_unlock(ptl); spin_unlock(pml); mmu_notifier_invalidate_range_end(&range); if (!skipped_uffd) { mm_dec_nr_ptes(mm); page_table_check_pte_clear_range(mm, addr, pgt_pmd); pte_free_defer(mm, pmd_pgtable(pgt_pmd)); } } i_mmap_unlock_read(mapping); } /** * collapse_file - collapse filemap/tmpfs/shmem pages into huge one. * * @mm: process address space where collapse happens * @addr: virtual collapse start address * @file: file that collapse on * @start: collapse start address * @cc: collapse context and scratchpad * * Basic scheme is simple, details are more complex: * - allocate and lock a new huge page; * - scan page cache, locking old pages * + swap/gup in pages if necessary; * - copy data to new page * - handle shmem holes * + re-validate that holes weren't filled by someone else * + check for userfaultfd * - finalize updates to the page cache; * - if replacing succeeds: * + unlock huge page; * + free old pages; * - if replacing failed; * + unlock old pages * + unlock and free huge page; */ static int collapse_file(struct mm_struct *mm, unsigned long addr, struct file *file, pgoff_t start, struct collapse_control *cc) { struct address_space *mapping = file->f_mapping; struct page *hpage; struct page *page; struct page *tmp; struct folio *folio; pgoff_t index = 0, end = start + HPAGE_PMD_NR; LIST_HEAD(pagelist); XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER); int nr_none = 0, result = SCAN_SUCCEED; bool is_shmem = shmem_file(file); int nr = 0; VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem); VM_BUG_ON(start & (HPAGE_PMD_NR - 1)); result = alloc_charge_hpage(&hpage, mm, cc); if (result != SCAN_SUCCEED) goto out; __SetPageLocked(hpage); if (is_shmem) __SetPageSwapBacked(hpage); hpage->index = start; hpage->mapping = mapping; /* * Ensure we have slots for all the pages in the range. This is * almost certainly a no-op because most of the pages must be present */ do { xas_lock_irq(&xas); xas_create_range(&xas); if (!xas_error(&xas)) break; xas_unlock_irq(&xas); if (!xas_nomem(&xas, GFP_KERNEL)) { result = SCAN_FAIL; goto rollback; } } while (1); for (index = start; index < end; index++) { xas_set(&xas, index); page = xas_load(&xas); VM_BUG_ON(index != xas.xa_index); if (is_shmem) { if (!page) { /* * Stop if extent has been truncated or * hole-punched, and is now completely * empty. */ if (index == start) { if (!xas_next_entry(&xas, end - 1)) { result = SCAN_TRUNCATED; goto xa_locked; } } nr_none++; continue; } if (xa_is_value(page) || !PageUptodate(page)) { xas_unlock_irq(&xas); /* swap in or instantiate fallocated page */ if (shmem_get_folio(mapping->host, index, &folio, SGP_NOALLOC)) { result = SCAN_FAIL; goto xa_unlocked; } /* drain lru cache to help isolate_lru_page() */ lru_add_drain(); page = folio_file_page(folio, index); } else if (trylock_page(page)) { get_page(page); xas_unlock_irq(&xas); } else { result = SCAN_PAGE_LOCK; goto xa_locked; } } else { /* !is_shmem */ if (!page || xa_is_value(page)) { xas_unlock_irq(&xas); page_cache_sync_readahead(mapping, &file->f_ra, file, index, end - index); /* drain lru cache to help isolate_lru_page() */ lru_add_drain(); page = find_lock_page(mapping, index); if (unlikely(page == NULL)) { result = SCAN_FAIL; goto xa_unlocked; } } else if (PageDirty(page)) { /* * khugepaged only works on read-only fd, * so this page is dirty because it hasn't * been flushed since first write. There * won't be new dirty pages. * * Trigger async flush here and hope the * writeback is done when khugepaged * revisits this page. * * This is a one-off situation. We are not * forcing writeback in loop. */ xas_unlock_irq(&xas); filemap_flush(mapping); result = SCAN_FAIL; goto xa_unlocked; } else if (PageWriteback(page)) { xas_unlock_irq(&xas); result = SCAN_FAIL; goto xa_unlocked; } else if (trylock_page(page)) { get_page(page); xas_unlock_irq(&xas); } else { result = SCAN_PAGE_LOCK; goto xa_locked; } } /* * The page must be locked, so we can drop the i_pages lock * without racing with truncate. */ VM_BUG_ON_PAGE(!PageLocked(page), page); /* make sure the page is up to date */ if (unlikely(!PageUptodate(page))) { result = SCAN_FAIL; goto out_unlock; } /* * If file was truncated then extended, or hole-punched, before * we locked the first page, then a THP might be there already. * This will be discovered on the first iteration. */ if (PageTransCompound(page)) { struct page *head = compound_head(page); result = compound_order(head) == HPAGE_PMD_ORDER && head->index == start /* Maybe PMD-mapped */ ? SCAN_PTE_MAPPED_HUGEPAGE : SCAN_PAGE_COMPOUND; goto out_unlock; } folio = page_folio(page); if (folio_mapping(folio) != mapping) { result = SCAN_TRUNCATED; goto out_unlock; } if (!is_shmem && (folio_test_dirty(folio) || folio_test_writeback(folio))) { /* * khugepaged only works on read-only fd, so this * page is dirty because it hasn't been flushed * since first write. */ result = SCAN_FAIL; goto out_unlock; } if (!folio_isolate_lru(folio)) { result = SCAN_DEL_PAGE_LRU; goto out_unlock; } if (!filemap_release_folio(folio, GFP_KERNEL)) { result = SCAN_PAGE_HAS_PRIVATE; folio_putback_lru(folio); goto out_unlock; } if (folio_mapped(folio)) try_to_unmap(folio, TTU_IGNORE_MLOCK | TTU_BATCH_FLUSH); xas_lock_irq(&xas); VM_BUG_ON_PAGE(page != xa_load(xas.xa, index), page); /* * We control three references to the page: * - we hold a pin on it; * - one reference from page cache; * - one from isolate_lru_page; * If those are the only references, then any new usage of the * page will have to fetch it from the page cache. That requires * locking the page to handle truncate, so any new usage will be * blocked until we unlock page after collapse/during rollback. */ if (page_count(page) != 3) { result = SCAN_PAGE_COUNT; xas_unlock_irq(&xas); putback_lru_page(page); goto out_unlock; } /* * Accumulate the pages that are being collapsed. */ list_add_tail(&page->lru, &pagelist); continue; out_unlock: unlock_page(page); put_page(page); goto xa_unlocked; } if (!is_shmem) { filemap_nr_thps_inc(mapping); /* * Paired with smp_mb() in do_dentry_open() to ensure * i_writecount is up to date and the update to nr_thps is * visible. Ensures the page cache will be truncated if the * file is opened writable. */ smp_mb(); if (inode_is_open_for_write(mapping->host)) { result = SCAN_FAIL; filemap_nr_thps_dec(mapping); } } xa_locked: xas_unlock_irq(&xas); xa_unlocked: /* * If collapse is successful, flush must be done now before copying. * If collapse is unsuccessful, does flush actually need to be done? * Do it anyway, to clear the state. */ try_to_unmap_flush(); if (result == SCAN_SUCCEED && nr_none && !shmem_charge(mapping->host, nr_none)) result = SCAN_FAIL; if (result != SCAN_SUCCEED) { nr_none = 0; goto rollback; } /* * The old pages are locked, so they won't change anymore. */ index = start; list_for_each_entry(page, &pagelist, lru) { while (index < page->index) { clear_highpage(hpage + (index % HPAGE_PMD_NR)); index++; } if (copy_mc_highpage(hpage + (page->index % HPAGE_PMD_NR), page) > 0) { result = SCAN_COPY_MC; goto rollback; } index++; } while (index < end) { clear_highpage(hpage + (index % HPAGE_PMD_NR)); index++; } if (nr_none) { struct vm_area_struct *vma; int nr_none_check = 0; i_mmap_lock_read(mapping); xas_lock_irq(&xas); xas_set(&xas, start); for (index = start; index < end; index++) { if (!xas_next(&xas)) { xas_store(&xas, XA_RETRY_ENTRY); if (xas_error(&xas)) { result = SCAN_STORE_FAILED; goto immap_locked; } nr_none_check++; } } if (nr_none != nr_none_check) { result = SCAN_PAGE_FILLED; goto immap_locked; } /* * If userspace observed a missing page in a VMA with a MODE_MISSING * userfaultfd, then it might expect a UFFD_EVENT_PAGEFAULT for that * page. If so, we need to roll back to avoid suppressing such an * event. Since wp/minor userfaultfds don't give userspace any * guarantees that the kernel doesn't fill a missing page with a zero * page, so they don't matter here. * * Any userfaultfds registered after this point will not be able to * observe any missing pages due to the previously inserted retry * entries. */ vma_interval_tree_foreach(vma, &mapping->i_mmap, start, end) { if (userfaultfd_missing(vma)) { result = SCAN_EXCEED_NONE_PTE; goto immap_locked; } } immap_locked: i_mmap_unlock_read(mapping); if (result != SCAN_SUCCEED) { xas_set(&xas, start); for (index = start; index < end; index++) { if (xas_next(&xas) == XA_RETRY_ENTRY) xas_store(&xas, NULL); } xas_unlock_irq(&xas); goto rollback; } } else { xas_lock_irq(&xas); } folio = page_folio(hpage); nr = folio_nr_pages(folio); if (is_shmem) __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr); else __lruvec_stat_mod_folio(folio, NR_FILE_THPS, nr); if (nr_none) { __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr_none); /* nr_none is always 0 for non-shmem. */ __lruvec_stat_mod_folio(folio, NR_SHMEM, nr_none); } /* * Mark hpage as uptodate before inserting it into the page cache so * that it isn't mistaken for an fallocated but unwritten page. */ folio_mark_uptodate(folio); folio_ref_add(folio, HPAGE_PMD_NR - 1); if (is_shmem) folio_mark_dirty(folio); folio_add_lru(folio); /* Join all the small entries into a single multi-index entry. */ xas_set_order(&xas, start, HPAGE_PMD_ORDER); xas_store(&xas, folio); WARN_ON_ONCE(xas_error(&xas)); xas_unlock_irq(&xas); /* * Remove pte page tables, so we can re-fault the page as huge. * If MADV_COLLAPSE, adjust result to call collapse_pte_mapped_thp(). */ retract_page_tables(mapping, start); if (cc && !cc->is_khugepaged) result = SCAN_PTE_MAPPED_HUGEPAGE; folio_unlock(folio); /* * The collapse has succeeded, so free the old pages. */ list_for_each_entry_safe(page, tmp, &pagelist, lru) { list_del(&page->lru); page->mapping = NULL; ClearPageActive(page); ClearPageUnevictable(page); unlock_page(page); folio_put_refs(page_folio(page), 3); } goto out; rollback: /* Something went wrong: roll back page cache changes */ if (nr_none) { xas_lock_irq(&xas); mapping->nrpages -= nr_none; xas_unlock_irq(&xas); shmem_uncharge(mapping->host, nr_none); } list_for_each_entry_safe(page, tmp, &pagelist, lru) { list_del(&page->lru); unlock_page(page); putback_lru_page(page); put_page(page); } /* * Undo the updates of filemap_nr_thps_inc for non-SHMEM * file only. This undo is not needed unless failure is * due to SCAN_COPY_MC. */ if (!is_shmem && result == SCAN_COPY_MC) { filemap_nr_thps_dec(mapping); /* * Paired with smp_mb() in do_dentry_open() to * ensure the update to nr_thps is visible. */ smp_mb(); } hpage->mapping = NULL; unlock_page(hpage); put_page(hpage); out: VM_BUG_ON(!list_empty(&pagelist)); trace_mm_khugepaged_collapse_file(mm, hpage, index, is_shmem, addr, file, nr, result); return result; } static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, struct file *file, pgoff_t start, struct collapse_control *cc) { struct page *page = NULL; struct address_space *mapping = file->f_mapping; XA_STATE(xas, &mapping->i_pages, start); int present, swap; int node = NUMA_NO_NODE; int result = SCAN_SUCCEED; present = 0; swap = 0; memset(cc->node_load, 0, sizeof(cc->node_load)); nodes_clear(cc->alloc_nmask); rcu_read_lock(); xas_for_each(&xas, page, start + HPAGE_PMD_NR - 1) { if (xas_retry(&xas, page)) continue; if (xa_is_value(page)) { ++swap; if (cc->is_khugepaged && swap > khugepaged_max_ptes_swap) { result = SCAN_EXCEED_SWAP_PTE; count_vm_event(THP_SCAN_EXCEED_SWAP_PTE); break; } continue; } /* * TODO: khugepaged should compact smaller compound pages * into a PMD sized page */ if (PageTransCompound(page)) { struct page *head = compound_head(page); result = compound_order(head) == HPAGE_PMD_ORDER && head->index == start /* Maybe PMD-mapped */ ? SCAN_PTE_MAPPED_HUGEPAGE : SCAN_PAGE_COMPOUND; /* * For SCAN_PTE_MAPPED_HUGEPAGE, further processing * by the caller won't touch the page cache, and so * it's safe to skip LRU and refcount checks before * returning. */ break; } node = page_to_nid(page); if (hpage_collapse_scan_abort(node, cc)) { result = SCAN_SCAN_ABORT; break; } cc->node_load[node]++; if (!PageLRU(page)) { result = SCAN_PAGE_LRU; break; } if (page_count(page) != 1 + page_mapcount(page) + page_has_private(page)) { result = SCAN_PAGE_COUNT; break; } /* * We probably should check if the page is referenced here, but * nobody would transfer pte_young() to PageReferenced() for us. * And rmap walk here is just too costly... */ present++; if (need_resched()) { xas_pause(&xas); cond_resched_rcu(); } } rcu_read_unlock(); if (result == SCAN_SUCCEED) { if (cc->is_khugepaged && present < HPAGE_PMD_NR - khugepaged_max_ptes_none) { result = SCAN_EXCEED_NONE_PTE; count_vm_event(THP_SCAN_EXCEED_NONE_PTE); } else { result = collapse_file(mm, addr, file, start, cc); } } trace_mm_khugepaged_scan_file(mm, page, file, present, swap, result); return result; } #else static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, struct file *file, pgoff_t start, struct collapse_control *cc) { BUILD_BUG(); } #endif static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, struct collapse_control *cc) __releases(&khugepaged_mm_lock) __acquires(&khugepaged_mm_lock) { struct vma_iterator vmi; struct khugepaged_mm_slot *mm_slot; struct mm_slot *slot; struct mm_struct *mm; struct vm_area_struct *vma; int progress = 0; VM_BUG_ON(!pages); lockdep_assert_held(&khugepaged_mm_lock); *result = SCAN_FAIL; if (khugepaged_scan.mm_slot) { mm_slot = khugepaged_scan.mm_slot; slot = &mm_slot->slot; } else { slot = list_entry(khugepaged_scan.mm_head.next, struct mm_slot, mm_node); mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot); khugepaged_scan.address = 0; khugepaged_scan.mm_slot = mm_slot; } spin_unlock(&khugepaged_mm_lock); mm = slot->mm; /* * Don't wait for semaphore (to avoid long wait times). Just move to * the next mm on the list. */ vma = NULL; if (unlikely(!mmap_read_trylock(mm))) goto breakouterloop_mmap_lock; progress++; if (unlikely(hpage_collapse_test_exit(mm))) goto breakouterloop; vma_iter_init(&vmi, mm, khugepaged_scan.address); for_each_vma(vmi, vma) { unsigned long hstart, hend; cond_resched(); if (unlikely(hpage_collapse_test_exit(mm))) { progress++; break; } if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false, true, PMD_ORDER)) { skip: progress++; continue; } hstart = round_up(vma->vm_start, HPAGE_PMD_SIZE); hend = round_down(vma->vm_end, HPAGE_PMD_SIZE); if (khugepaged_scan.address > hend) goto skip; if (khugepaged_scan.address < hstart) khugepaged_scan.address = hstart; VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); while (khugepaged_scan.address < hend) { bool mmap_locked = true; cond_resched(); if (unlikely(hpage_collapse_test_exit(mm))) goto breakouterloop; VM_BUG_ON(khugepaged_scan.address < hstart || khugepaged_scan.address + HPAGE_PMD_SIZE > hend); if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) { struct file *file = get_file(vma->vm_file); pgoff_t pgoff = linear_page_index(vma, khugepaged_scan.address); mmap_read_unlock(mm); mmap_locked = false; *result = hpage_collapse_scan_file(mm, khugepaged_scan.address, file, pgoff, cc); fput(file); if (*result == SCAN_PTE_MAPPED_HUGEPAGE) { mmap_read_lock(mm); if (hpage_collapse_test_exit(mm)) goto breakouterloop; *result = collapse_pte_mapped_thp(mm, khugepaged_scan.address, false); if (*result == SCAN_PMD_MAPPED) *result = SCAN_SUCCEED; mmap_read_unlock(mm); } } else { *result = hpage_collapse_scan_pmd(mm, vma, khugepaged_scan.address, &mmap_locked, cc); } if (*result == SCAN_SUCCEED) ++khugepaged_pages_collapsed; /* move to next address */ khugepaged_scan.address += HPAGE_PMD_SIZE; progress += HPAGE_PMD_NR; if (!mmap_locked) /* * We released mmap_lock so break loop. Note * that we drop mmap_lock before all hugepage * allocations, so if allocation fails, we are * guaranteed to break here and report the * correct result back to caller. */ goto breakouterloop_mmap_lock; if (progress >= pages) goto breakouterloop; } } breakouterloop: mmap_read_unlock(mm); /* exit_mmap will destroy ptes after this */ breakouterloop_mmap_lock: spin_lock(&khugepaged_mm_lock); VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot); /* * Release the current mm_slot if this mm is about to die, or * if we scanned all vmas of this mm. */ if (hpage_collapse_test_exit(mm) || !vma) { /* * Make sure that if mm_users is reaching zero while * khugepaged runs here, khugepaged_exit will find * mm_slot not pointing to the exiting mm. */ if (slot->mm_node.next != &khugepaged_scan.mm_head) { slot = list_entry(slot->mm_node.next, struct mm_slot, mm_node); khugepaged_scan.mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot); khugepaged_scan.address = 0; } else { khugepaged_scan.mm_slot = NULL; khugepaged_full_scans++; } collect_mm_slot(mm_slot); } return progress; } static int khugepaged_has_work(void) { return !list_empty(&khugepaged_scan.mm_head) && hugepage_flags_enabled(); } static int khugepaged_wait_event(void) { return !list_empty(&khugepaged_scan.mm_head) || kthread_should_stop(); } static void khugepaged_do_scan(struct collapse_control *cc) { unsigned int progress = 0, pass_through_head = 0; unsigned int pages = READ_ONCE(khugepaged_pages_to_scan); bool wait = true; int result = SCAN_SUCCEED; lru_add_drain_all(); while (true) { cond_resched(); if (unlikely(kthread_should_stop())) break; spin_lock(&khugepaged_mm_lock); if (!khugepaged_scan.mm_slot) pass_through_head++; if (khugepaged_has_work() && pass_through_head < 2) progress += khugepaged_scan_mm_slot(pages - progress, &result, cc); else progress = pages; spin_unlock(&khugepaged_mm_lock); if (progress >= pages) break; if (result == SCAN_ALLOC_HUGE_PAGE_FAIL) { /* * If fail to allocate the first time, try to sleep for * a while. When hit again, cancel the scan. */ if (!wait) break; wait = false; khugepaged_alloc_sleep(); } } } static bool khugepaged_should_wakeup(void) { return kthread_should_stop() || time_after_eq(jiffies, khugepaged_sleep_expire); } static void khugepaged_wait_work(void) { if (khugepaged_has_work()) { const unsigned long scan_sleep_jiffies = msecs_to_jiffies(khugepaged_scan_sleep_millisecs); if (!scan_sleep_jiffies) return; khugepaged_sleep_expire = jiffies + scan_sleep_jiffies; wait_event_freezable_timeout(khugepaged_wait, khugepaged_should_wakeup(), scan_sleep_jiffies); return; } if (hugepage_flags_enabled()) wait_event_freezable(khugepaged_wait, khugepaged_wait_event()); } static int khugepaged(void *none) { struct khugepaged_mm_slot *mm_slot; set_freezable(); set_user_nice(current, MAX_NICE); while (!kthread_should_stop()) { khugepaged_do_scan(&khugepaged_collapse_control); khugepaged_wait_work(); } spin_lock(&khugepaged_mm_lock); mm_slot = khugepaged_scan.mm_slot; khugepaged_scan.mm_slot = NULL; if (mm_slot) collect_mm_slot(mm_slot); spin_unlock(&khugepaged_mm_lock); return 0; } static void set_recommended_min_free_kbytes(void) { struct zone *zone; int nr_zones = 0; unsigned long recommended_min; if (!hugepage_flags_enabled()) { calculate_min_free_kbytes(); goto update_wmarks; } for_each_populated_zone(zone) { /* * We don't need to worry about fragmentation of * ZONE_MOVABLE since it only has movable pages. */ if (zone_idx(zone) > gfp_zone(GFP_USER)) continue; nr_zones++; } /* Ensure 2 pageblocks are free to assist fragmentation avoidance */ recommended_min = pageblock_nr_pages * nr_zones * 2; /* * Make sure that on average at least two pageblocks are almost free * of another type, one for a migratetype to fall back to and a * second to avoid subsequent fallbacks of other types There are 3 * MIGRATE_TYPES we care about. */ recommended_min += pageblock_nr_pages * nr_zones * MIGRATE_PCPTYPES * MIGRATE_PCPTYPES; /* don't ever allow to reserve more than 5% of the lowmem */ recommended_min = min(recommended_min, (unsigned long) nr_free_buffer_pages() / 20); recommended_min <<= (PAGE_SHIFT-10); if (recommended_min > min_free_kbytes) { if (user_min_free_kbytes >= 0) pr_info("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n", min_free_kbytes, recommended_min); min_free_kbytes = recommended_min; } update_wmarks: setup_per_zone_wmarks(); } int start_stop_khugepaged(void) { int err = 0; mutex_lock(&khugepaged_mutex); if (hugepage_flags_enabled()) { if (!khugepaged_thread) khugepaged_thread = kthread_run(khugepaged, NULL, "khugepaged"); if (IS_ERR(khugepaged_thread)) { pr_err("khugepaged: kthread_run(khugepaged) failed\n"); err = PTR_ERR(khugepaged_thread); khugepaged_thread = NULL; goto fail; } if (!list_empty(&khugepaged_scan.mm_head)) wake_up_interruptible(&khugepaged_wait); } else if (khugepaged_thread) { kthread_stop(khugepaged_thread); khugepaged_thread = NULL; } set_recommended_min_free_kbytes(); fail: mutex_unlock(&khugepaged_mutex); return err; } void khugepaged_min_free_kbytes_update(void) { mutex_lock(&khugepaged_mutex); if (hugepage_flags_enabled() && khugepaged_thread) set_recommended_min_free_kbytes(); mutex_unlock(&khugepaged_mutex); } bool current_is_khugepaged(void) { return kthread_func(current) == khugepaged; } static int madvise_collapse_errno(enum scan_result r) { /* * MADV_COLLAPSE breaks from existing madvise(2) conventions to provide * actionable feedback to caller, so they may take an appropriate * fallback measure depending on the nature of the failure. */ switch (r) { case SCAN_ALLOC_HUGE_PAGE_FAIL: return -ENOMEM; case SCAN_CGROUP_CHARGE_FAIL: case SCAN_EXCEED_NONE_PTE: return -EBUSY; /* Resource temporary unavailable - trying again might succeed */ case SCAN_PAGE_COUNT: case SCAN_PAGE_LOCK: case SCAN_PAGE_LRU: case SCAN_DEL_PAGE_LRU: case SCAN_PAGE_FILLED: return -EAGAIN; /* * Other: Trying again likely not to succeed / error intrinsic to * specified memory range. khugepaged likely won't be able to collapse * either. */ default: return -EINVAL; } } int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end) { struct collapse_control *cc; struct mm_struct *mm = vma->vm_mm; unsigned long hstart, hend, addr; int thps = 0, last_fail = SCAN_FAIL; bool mmap_locked = true; BUG_ON(vma->vm_start > start); BUG_ON(vma->vm_end < end); *prev = vma; if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false, false, PMD_ORDER)) return -EINVAL; cc = kmalloc(sizeof(*cc), GFP_KERNEL); if (!cc) return -ENOMEM; cc->is_khugepaged = false; mmgrab(mm); lru_add_drain_all(); hstart = (start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; hend = end & HPAGE_PMD_MASK; for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) { int result = SCAN_FAIL; if (!mmap_locked) { cond_resched(); mmap_read_lock(mm); mmap_locked = true; result = hugepage_vma_revalidate(mm, addr, false, &vma, cc); if (result != SCAN_SUCCEED) { last_fail = result; goto out_nolock; } hend = min(hend, vma->vm_end & HPAGE_PMD_MASK); } mmap_assert_locked(mm); memset(cc->node_load, 0, sizeof(cc->node_load)); nodes_clear(cc->alloc_nmask); if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) { struct file *file = get_file(vma->vm_file); pgoff_t pgoff = linear_page_index(vma, addr); mmap_read_unlock(mm); mmap_locked = false; result = hpage_collapse_scan_file(mm, addr, file, pgoff, cc); fput(file); } else { result = hpage_collapse_scan_pmd(mm, vma, addr, &mmap_locked, cc); } if (!mmap_locked) *prev = NULL; /* Tell caller we dropped mmap_lock */ handle_result: switch (result) { case SCAN_SUCCEED: case SCAN_PMD_MAPPED: ++thps; break; case SCAN_PTE_MAPPED_HUGEPAGE: BUG_ON(mmap_locked); BUG_ON(*prev); mmap_read_lock(mm); result = collapse_pte_mapped_thp(mm, addr, true); mmap_read_unlock(mm); goto handle_result; /* Whitelisted set of results where continuing OK */ case SCAN_PMD_NULL: case SCAN_PTE_NON_PRESENT: case SCAN_PTE_UFFD_WP: case SCAN_PAGE_RO: case SCAN_LACK_REFERENCED_PAGE: case SCAN_PAGE_NULL: case SCAN_PAGE_COUNT: case SCAN_PAGE_LOCK: case SCAN_PAGE_COMPOUND: case SCAN_PAGE_LRU: case SCAN_DEL_PAGE_LRU: last_fail = result; break; default: last_fail = result; /* Other error, exit */ goto out_maybelock; } } out_maybelock: /* Caller expects us to hold mmap_lock on return */ if (!mmap_locked) mmap_read_lock(mm); out_nolock: mmap_assert_locked(mm); mmdrop(mm); kfree(cc); return thps == ((hend - hstart) >> HPAGE_PMD_SHIFT) ? 0 : madvise_collapse_errno(last_fail); }
4 4 2 1 1 3 1 1 1184 5 287 2 1 1 11 2 4 4 1 2 1 4 1 1 1 5 4 4 5 4 1 4 14 3 1 1 2 2 1 2 1 1 2 345 405 1670 1671 191 407 418 86 12 12 11 11 1 1 11 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 // SPDX-License-Identifier: GPL-2.0 /* * Shared Memory Communications over RDMA (SMC-R) and RoCE * * Generic netlink support functions to configure an SMC-R PNET table * * Copyright IBM Corp. 2016 * * Author(s): Thomas Richter <tmricht@linux.vnet.ibm.com> */ #include <linux/module.h> #include <linux/list.h> #include <linux/ctype.h> #include <linux/mutex.h> #include <net/netlink.h> #include <net/genetlink.h> #include <uapi/linux/if.h> #include <uapi/linux/smc.h> #include <rdma/ib_verbs.h> #include <net/netns/generic.h> #include "smc_netns.h" #include "smc_pnet.h" #include "smc_ib.h" #include "smc_ism.h" #include "smc_core.h" static struct net_device *__pnet_find_base_ndev(struct net_device *ndev); static struct net_device *pnet_find_base_ndev(struct net_device *ndev); static const struct nla_policy smc_pnet_policy[SMC_PNETID_MAX + 1] = { [SMC_PNETID_NAME] = { .type = NLA_NUL_STRING, .len = SMC_MAX_PNETID_LEN }, [SMC_PNETID_ETHNAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 }, [SMC_PNETID_IBNAME] = { .type = NLA_NUL_STRING, .len = IB_DEVICE_NAME_MAX - 1 }, [SMC_PNETID_IBPORT] = { .type = NLA_U8 } }; static struct genl_family smc_pnet_nl_family; enum smc_pnet_nametype { SMC_PNET_ETH = 1, SMC_PNET_IB = 2, }; /* pnet entry stored in pnet table */ struct smc_pnetentry { struct list_head list; char pnet_name[SMC_MAX_PNETID_LEN + 1]; enum smc_pnet_nametype type; union { struct { char eth_name[IFNAMSIZ + 1]; struct net_device *ndev; netdevice_tracker dev_tracker; }; struct { char ib_name[IB_DEVICE_NAME_MAX + 1]; u8 ib_port; }; }; }; /* Check if the pnetid is set */ bool smc_pnet_is_pnetid_set(u8 *pnetid) { if (pnetid[0] == 0 || pnetid[0] == _S) return false; return true; } /* Check if two given pnetids match */ static bool smc_pnet_match(u8 *pnetid1, u8 *pnetid2) { int i; for (i = 0; i < SMC_MAX_PNETID_LEN; i++) { if ((pnetid1[i] == 0 || pnetid1[i] == _S) && (pnetid2[i] == 0 || pnetid2[i] == _S)) break; if (pnetid1[i] != pnetid2[i]) return false; } return true; } /* Remove a pnetid from the pnet table. */ static int smc_pnet_remove_by_pnetid(struct net *net, char *pnet_name) { struct smc_pnetentry *pnetelem, *tmp_pe; struct smc_pnettable *pnettable; struct smc_ib_device *ibdev; struct smcd_dev *smcd; struct smc_net *sn; int rc = -ENOENT; int ibport; /* get pnettable for namespace */ sn = net_generic(net, smc_net_id); pnettable = &sn->pnettable; /* remove table entry */ mutex_lock(&pnettable->lock); list_for_each_entry_safe(pnetelem, tmp_pe, &pnettable->pnetlist, list) { if (!pnet_name || smc_pnet_match(pnetelem->pnet_name, pnet_name)) { list_del(&pnetelem->list); if (pnetelem->type == SMC_PNET_ETH && pnetelem->ndev) { netdev_put(pnetelem->ndev, &pnetelem->dev_tracker); pr_warn_ratelimited("smc: net device %s " "erased user defined " "pnetid %.16s\n", pnetelem->eth_name, pnetelem->pnet_name); } kfree(pnetelem); rc = 0; } } mutex_unlock(&pnettable->lock); /* if this is not the initial namespace, stop here */ if (net != &init_net) return rc; /* remove ib devices */ mutex_lock(&smc_ib_devices.mutex); list_for_each_entry(ibdev, &smc_ib_devices.list, list) { for (ibport = 0; ibport < SMC_MAX_PORTS; ibport++) { if (ibdev->pnetid_by_user[ibport] && (!pnet_name || smc_pnet_match(pnet_name, ibdev->pnetid[ibport]))) { pr_warn_ratelimited("smc: ib device %s ibport " "%d erased user defined " "pnetid %.16s\n", ibdev->ibdev->name, ibport + 1, ibdev->pnetid[ibport]); memset(ibdev->pnetid[ibport], 0, SMC_MAX_PNETID_LEN); ibdev->pnetid_by_user[ibport] = false; rc = 0; } } } mutex_unlock(&smc_ib_devices.mutex); /* remove smcd devices */ mutex_lock(&smcd_dev_list.mutex); list_for_each_entry(smcd, &smcd_dev_list.list, list) { if (smcd->pnetid_by_user && (!pnet_name || smc_pnet_match(pnet_name, smcd->pnetid))) { pr_warn_ratelimited("smc: smcd device %s " "erased user defined pnetid " "%.16s\n", dev_name(smcd->ops->get_dev(smcd)), smcd->pnetid); memset(smcd->pnetid, 0, SMC_MAX_PNETID_LEN); smcd->pnetid_by_user = false; rc = 0; } } mutex_unlock(&smcd_dev_list.mutex); return rc; } /* Add the reference to a given network device to the pnet table. */ static int smc_pnet_add_by_ndev(struct net_device *ndev) { struct smc_pnetentry *pnetelem, *tmp_pe; struct smc_pnettable *pnettable; struct net *net = dev_net(ndev); struct smc_net *sn; int rc = -ENOENT; /* get pnettable for namespace */ sn = net_generic(net, smc_net_id); pnettable = &sn->pnettable; mutex_lock(&pnettable->lock); list_for_each_entry_safe(pnetelem, tmp_pe, &pnettable->pnetlist, list) { if (pnetelem->type == SMC_PNET_ETH && !pnetelem->ndev && !strncmp(pnetelem->eth_name, ndev->name, IFNAMSIZ)) { netdev_hold(ndev, &pnetelem->dev_tracker, GFP_ATOMIC); pnetelem->ndev = ndev; rc = 0; pr_warn_ratelimited("smc: adding net device %s with " "user defined pnetid %.16s\n", pnetelem->eth_name, pnetelem->pnet_name); break; } } mutex_unlock(&pnettable->lock); return rc; } /* Remove the reference to a given network device from the pnet table. */ static int smc_pnet_remove_by_ndev(struct net_device *ndev) { struct smc_pnetentry *pnetelem, *tmp_pe; struct smc_pnettable *pnettable; struct net *net = dev_net(ndev); struct smc_net *sn; int rc = -ENOENT; /* get pnettable for namespace */ sn = net_generic(net, smc_net_id); pnettable = &sn->pnettable; mutex_lock(&pnettable->lock); list_for_each_entry_safe(pnetelem, tmp_pe, &pnettable->pnetlist, list) { if (pnetelem->type == SMC_PNET_ETH && pnetelem->ndev == ndev) { netdev_put(pnetelem->ndev, &pnetelem->dev_tracker); pnetelem->ndev = NULL; rc = 0; pr_warn_ratelimited("smc: removing net device %s with " "user defined pnetid %.16s\n", pnetelem->eth_name, pnetelem->pnet_name); break; } } mutex_unlock(&pnettable->lock); return rc; } /* Apply pnetid to ib device when no pnetid is set. */ static bool smc_pnet_apply_ib(struct smc_ib_device *ib_dev, u8 ib_port, char *pnet_name) { bool applied = false; mutex_lock(&smc_ib_devices.mutex); if (!smc_pnet_is_pnetid_set(ib_dev->pnetid[ib_port - 1])) { memcpy(ib_dev->pnetid[ib_port - 1], pnet_name, SMC_MAX_PNETID_LEN); ib_dev->pnetid_by_user[ib_port - 1] = true; applied = true; } mutex_unlock(&smc_ib_devices.mutex); return applied; } /* Apply pnetid to smcd device when no pnetid is set. */ static bool smc_pnet_apply_smcd(struct smcd_dev *smcd_dev, char *pnet_name) { bool applied = false; mutex_lock(&smcd_dev_list.mutex); if (!smc_pnet_is_pnetid_set(smcd_dev->pnetid)) { memcpy(smcd_dev->pnetid, pnet_name, SMC_MAX_PNETID_LEN); smcd_dev->pnetid_by_user = true; applied = true; } mutex_unlock(&smcd_dev_list.mutex); return applied; } /* The limit for pnetid is 16 characters. * Valid characters should be (single-byte character set) a-z, A-Z, 0-9. * Lower case letters are converted to upper case. * Interior blanks should not be used. */ static bool smc_pnetid_valid(const char *pnet_name, char *pnetid) { char *bf = skip_spaces(pnet_name); size_t len = strlen(bf); char *end = bf + len; if (!len) return false; while (--end >= bf && isspace(*end)) ; if (end - bf >= SMC_MAX_PNETID_LEN) return false; while (bf <= end) { if (!isalnum(*bf)) return false; *pnetid++ = islower(*bf) ? toupper(*bf) : *bf; bf++; } *pnetid = '\0'; return true; } /* Find an infiniband device by a given name. The device might not exist. */ static struct smc_ib_device *smc_pnet_find_ib(char *ib_name) { struct smc_ib_device *ibdev; mutex_lock(&smc_ib_devices.mutex); list_for_each_entry(ibdev, &smc_ib_devices.list, list) { if (!strncmp(ibdev->ibdev->name, ib_name, sizeof(ibdev->ibdev->name)) || (ibdev->ibdev->dev.parent && !strncmp(dev_name(ibdev->ibdev->dev.parent), ib_name, IB_DEVICE_NAME_MAX - 1))) { goto out; } } ibdev = NULL; out: mutex_unlock(&smc_ib_devices.mutex); return ibdev; } /* Find an smcd device by a given name. The device might not exist. */ static struct smcd_dev *smc_pnet_find_smcd(char *smcd_name) { struct smcd_dev *smcd_dev; mutex_lock(&smcd_dev_list.mutex); list_for_each_entry(smcd_dev, &smcd_dev_list.list, list) { if (!strncmp(dev_name(smcd_dev->ops->get_dev(smcd_dev)), smcd_name, IB_DEVICE_NAME_MAX - 1)) goto out; } smcd_dev = NULL; out: mutex_unlock(&smcd_dev_list.mutex); return smcd_dev; } static int smc_pnet_add_eth(struct smc_pnettable *pnettable, struct net *net, char *eth_name, char *pnet_name) { struct smc_pnetentry *tmp_pe, *new_pe; struct net_device *ndev, *base_ndev; u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; bool new_netdev; int rc; /* check if (base) netdev already has a pnetid. If there is one, we do * not want to add a pnet table entry */ rc = -EEXIST; ndev = dev_get_by_name(net, eth_name); /* dev_hold() */ if (ndev) { base_ndev = pnet_find_base_ndev(ndev); if (!smc_pnetid_by_dev_port(base_ndev->dev.parent, base_ndev->dev_port, ndev_pnetid)) goto out_put; } /* add a new netdev entry to the pnet table if there isn't one */ rc = -ENOMEM; new_pe = kzalloc(sizeof(*new_pe), GFP_KERNEL); if (!new_pe) goto out_put; new_pe->type = SMC_PNET_ETH; memcpy(new_pe->pnet_name, pnet_name, SMC_MAX_PNETID_LEN); strncpy(new_pe->eth_name, eth_name, IFNAMSIZ); rc = -EEXIST; new_netdev = true; mutex_lock(&pnettable->lock); list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) { if (tmp_pe->type == SMC_PNET_ETH && !strncmp(tmp_pe->eth_name, eth_name, IFNAMSIZ)) { new_netdev = false; break; } } if (new_netdev) { if (ndev) { new_pe->ndev = ndev; netdev_tracker_alloc(ndev, &new_pe->dev_tracker, GFP_ATOMIC); } list_add_tail(&new_pe->list, &pnettable->pnetlist); mutex_unlock(&pnettable->lock); } else { mutex_unlock(&pnettable->lock); kfree(new_pe); goto out_put; } if (ndev) pr_warn_ratelimited("smc: net device %s " "applied user defined pnetid %.16s\n", new_pe->eth_name, new_pe->pnet_name); return 0; out_put: dev_put(ndev); return rc; } static int smc_pnet_add_ib(struct smc_pnettable *pnettable, char *ib_name, u8 ib_port, char *pnet_name) { struct smc_pnetentry *tmp_pe, *new_pe; struct smc_ib_device *ib_dev; bool smcddev_applied = true; bool ibdev_applied = true; struct smcd_dev *smcd; struct device *dev; bool new_ibdev; /* try to apply the pnetid to active devices */ ib_dev = smc_pnet_find_ib(ib_name); if (ib_dev) { ibdev_applied = smc_pnet_apply_ib(ib_dev, ib_port, pnet_name); if (ibdev_applied) pr_warn_ratelimited("smc: ib device %s ibport %d " "applied user defined pnetid " "%.16s\n", ib_dev->ibdev->name, ib_port, ib_dev->pnetid[ib_port - 1]); } smcd = smc_pnet_find_smcd(ib_name); if (smcd) { smcddev_applied = smc_pnet_apply_smcd(smcd, pnet_name); if (smcddev_applied) { dev = smcd->ops->get_dev(smcd); pr_warn_ratelimited("smc: smcd device %s " "applied user defined pnetid " "%.16s\n", dev_name(dev), smcd->pnetid); } } /* Apply fails when a device has a hardware-defined pnetid set, do not * add a pnet table entry in that case. */ if (!ibdev_applied || !smcddev_applied) return -EEXIST; /* add a new ib entry to the pnet table if there isn't one */ new_pe = kzalloc(sizeof(*new_pe), GFP_KERNEL); if (!new_pe) return -ENOMEM; new_pe->type = SMC_PNET_IB; memcpy(new_pe->pnet_name, pnet_name, SMC_MAX_PNETID_LEN); strncpy(new_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX); new_pe->ib_port = ib_port; new_ibdev = true; mutex_lock(&pnettable->lock); list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) { if (tmp_pe->type == SMC_PNET_IB && !strncmp(tmp_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX)) { new_ibdev = false; break; } } if (new_ibdev) { list_add_tail(&new_pe->list, &pnettable->pnetlist); mutex_unlock(&pnettable->lock); } else { mutex_unlock(&pnettable->lock); kfree(new_pe); } return (new_ibdev) ? 0 : -EEXIST; } /* Append a pnetid to the end of the pnet table if not already on this list. */ static int smc_pnet_enter(struct net *net, struct nlattr *tb[]) { char pnet_name[SMC_MAX_PNETID_LEN + 1]; struct smc_pnettable *pnettable; bool new_netdev = false; bool new_ibdev = false; struct smc_net *sn; u8 ibport = 1; char *string; int rc; /* get pnettable for namespace */ sn = net_generic(net, smc_net_id); pnettable = &sn->pnettable; rc = -EINVAL; if (!tb[SMC_PNETID_NAME]) goto error; string = (char *)nla_data(tb[SMC_PNETID_NAME]); if (!smc_pnetid_valid(string, pnet_name)) goto error; if (tb[SMC_PNETID_ETHNAME]) { string = (char *)nla_data(tb[SMC_PNETID_ETHNAME]); rc = smc_pnet_add_eth(pnettable, net, string, pnet_name); if (!rc) new_netdev = true; else if (rc != -EEXIST) goto error; } /* if this is not the initial namespace, stop here */ if (net != &init_net) return new_netdev ? 0 : -EEXIST; rc = -EINVAL; if (tb[SMC_PNETID_IBNAME]) { string = (char *)nla_data(tb[SMC_PNETID_IBNAME]); string = strim(string); if (tb[SMC_PNETID_IBPORT]) { ibport = nla_get_u8(tb[SMC_PNETID_IBPORT]); if (ibport < 1 || ibport > SMC_MAX_PORTS) goto error; } rc = smc_pnet_add_ib(pnettable, string, ibport, pnet_name); if (!rc) new_ibdev = true; else if (rc != -EEXIST) goto error; } return (new_netdev || new_ibdev) ? 0 : -EEXIST; error: return rc; } /* Convert an smc_pnetentry to a netlink attribute sequence */ static int smc_pnet_set_nla(struct sk_buff *msg, struct smc_pnetentry *pnetelem) { if (nla_put_string(msg, SMC_PNETID_NAME, pnetelem->pnet_name)) return -1; if (pnetelem->type == SMC_PNET_ETH) { if (nla_put_string(msg, SMC_PNETID_ETHNAME, pnetelem->eth_name)) return -1; } else { if (nla_put_string(msg, SMC_PNETID_ETHNAME, "n/a")) return -1; } if (pnetelem->type == SMC_PNET_IB) { if (nla_put_string(msg, SMC_PNETID_IBNAME, pnetelem->ib_name) || nla_put_u8(msg, SMC_PNETID_IBPORT, pnetelem->ib_port)) return -1; } else { if (nla_put_string(msg, SMC_PNETID_IBNAME, "n/a") || nla_put_u8(msg, SMC_PNETID_IBPORT, 0xff)) return -1; } return 0; } static int smc_pnet_add(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); return smc_pnet_enter(net, info->attrs); } static int smc_pnet_del(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); if (!info->attrs[SMC_PNETID_NAME]) return -EINVAL; return smc_pnet_remove_by_pnetid(net, (char *)nla_data(info->attrs[SMC_PNETID_NAME])); } static int smc_pnet_dump_start(struct netlink_callback *cb) { cb->args[0] = 0; return 0; } static int smc_pnet_dumpinfo(struct sk_buff *skb, u32 portid, u32 seq, u32 flags, struct smc_pnetentry *pnetelem) { void *hdr; hdr = genlmsg_put(skb, portid, seq, &smc_pnet_nl_family, flags, SMC_PNETID_GET); if (!hdr) return -ENOMEM; if (smc_pnet_set_nla(skb, pnetelem) < 0) { genlmsg_cancel(skb, hdr); return -EMSGSIZE; } genlmsg_end(skb, hdr); return 0; } static int _smc_pnet_dump(struct net *net, struct sk_buff *skb, u32 portid, u32 seq, u8 *pnetid, int start_idx) { struct smc_pnettable *pnettable; struct smc_pnetentry *pnetelem; struct smc_net *sn; int idx = 0; /* get pnettable for namespace */ sn = net_generic(net, smc_net_id); pnettable = &sn->pnettable; /* dump pnettable entries */ mutex_lock(&pnettable->lock); list_for_each_entry(pnetelem, &pnettable->pnetlist, list) { if (pnetid && !smc_pnet_match(pnetelem->pnet_name, pnetid)) continue; if (idx++ < start_idx) continue; /* if this is not the initial namespace, dump only netdev */ if (net != &init_net && pnetelem->type != SMC_PNET_ETH) continue; if (smc_pnet_dumpinfo(skb, portid, seq, NLM_F_MULTI, pnetelem)) { --idx; break; } } mutex_unlock(&pnettable->lock); return idx; } static int smc_pnet_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); int idx; idx = _smc_pnet_dump(net, skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NULL, cb->args[0]); cb->args[0] = idx; return skb->len; } /* Retrieve one PNETID entry */ static int smc_pnet_get(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct sk_buff *msg; void *hdr; if (!info->attrs[SMC_PNETID_NAME]) return -EINVAL; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; _smc_pnet_dump(net, msg, info->snd_portid, info->snd_seq, nla_data(info->attrs[SMC_PNETID_NAME]), 0); /* finish multi part message and send it */ hdr = nlmsg_put(msg, info->snd_portid, info->snd_seq, NLMSG_DONE, 0, NLM_F_MULTI); if (!hdr) { nlmsg_free(msg); return -EMSGSIZE; } return genlmsg_reply(msg, info); } /* Remove and delete all pnetids from pnet table. */ static int smc_pnet_flush(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); smc_pnet_remove_by_pnetid(net, NULL); return 0; } /* SMC_PNETID generic netlink operation definition */ static const struct genl_ops smc_pnet_ops[] = { { .cmd = SMC_PNETID_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, /* can be retrieved by unprivileged users */ .doit = smc_pnet_get, .dumpit = smc_pnet_dump, .start = smc_pnet_dump_start }, { .cmd = SMC_PNETID_ADD, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = smc_pnet_add }, { .cmd = SMC_PNETID_DEL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = smc_pnet_del }, { .cmd = SMC_PNETID_FLUSH, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = smc_pnet_flush } }; /* SMC_PNETID family definition */ static struct genl_family smc_pnet_nl_family __ro_after_init = { .hdrsize = 0, .name = SMCR_GENL_FAMILY_NAME, .version = SMCR_GENL_FAMILY_VERSION, .maxattr = SMC_PNETID_MAX, .policy = smc_pnet_policy, .netnsok = true, .module = THIS_MODULE, .ops = smc_pnet_ops, .n_ops = ARRAY_SIZE(smc_pnet_ops), .resv_start_op = SMC_PNETID_FLUSH + 1, }; bool smc_pnet_is_ndev_pnetid(struct net *net, u8 *pnetid) { struct smc_net *sn = net_generic(net, smc_net_id); struct smc_pnetids_ndev_entry *pe; bool rc = false; read_lock(&sn->pnetids_ndev.lock); list_for_each_entry(pe, &sn->pnetids_ndev.list, list) { if (smc_pnet_match(pnetid, pe->pnetid)) { rc = true; goto unlock; } } unlock: read_unlock(&sn->pnetids_ndev.lock); return rc; } static int smc_pnet_add_pnetid(struct net *net, u8 *pnetid) { struct smc_net *sn = net_generic(net, smc_net_id); struct smc_pnetids_ndev_entry *pe, *pi; pe = kzalloc(sizeof(*pe), GFP_KERNEL); if (!pe) return -ENOMEM; write_lock(&sn->pnetids_ndev.lock); list_for_each_entry(pi, &sn->pnetids_ndev.list, list) { if (smc_pnet_match(pnetid, pe->pnetid)) { refcount_inc(&pi->refcnt); kfree(pe); goto unlock; } } refcount_set(&pe->refcnt, 1); memcpy(pe->pnetid, pnetid, SMC_MAX_PNETID_LEN); list_add_tail(&pe->list, &sn->pnetids_ndev.list); unlock: write_unlock(&sn->pnetids_ndev.lock); return 0; } static void smc_pnet_remove_pnetid(struct net *net, u8 *pnetid) { struct smc_net *sn = net_generic(net, smc_net_id); struct smc_pnetids_ndev_entry *pe, *pe2; write_lock(&sn->pnetids_ndev.lock); list_for_each_entry_safe(pe, pe2, &sn->pnetids_ndev.list, list) { if (smc_pnet_match(pnetid, pe->pnetid)) { if (refcount_dec_and_test(&pe->refcnt)) { list_del(&pe->list); kfree(pe); } break; } } write_unlock(&sn->pnetids_ndev.lock); } static void smc_pnet_add_base_pnetid(struct net *net, struct net_device *dev, u8 *ndev_pnetid) { struct net_device *base_dev; base_dev = __pnet_find_base_ndev(dev); if (base_dev->flags & IFF_UP && !smc_pnetid_by_dev_port(base_dev->dev.parent, base_dev->dev_port, ndev_pnetid)) { /* add to PNETIDs list */ smc_pnet_add_pnetid(net, ndev_pnetid); } } /* create initial list of netdevice pnetids */ static void smc_pnet_create_pnetids_list(struct net *net) { u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; struct net_device *dev; rtnl_lock(); for_each_netdev(net, dev) smc_pnet_add_base_pnetid(net, dev, ndev_pnetid); rtnl_unlock(); } /* clean up list of netdevice pnetids */ static void smc_pnet_destroy_pnetids_list(struct net *net) { struct smc_net *sn = net_generic(net, smc_net_id); struct smc_pnetids_ndev_entry *pe, *temp_pe; write_lock(&sn->pnetids_ndev.lock); list_for_each_entry_safe(pe, temp_pe, &sn->pnetids_ndev.list, list) { list_del(&pe->list); kfree(pe); } write_unlock(&sn->pnetids_ndev.lock); } static int smc_pnet_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *event_dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(event_dev); u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; switch (event) { case NETDEV_REBOOT: case NETDEV_UNREGISTER: smc_pnet_remove_by_ndev(event_dev); smc_ib_ndev_change(event_dev, event); return NOTIFY_OK; case NETDEV_REGISTER: smc_pnet_add_by_ndev(event_dev); smc_ib_ndev_change(event_dev, event); return NOTIFY_OK; case NETDEV_UP: smc_pnet_add_base_pnetid(net, event_dev, ndev_pnetid); return NOTIFY_OK; case NETDEV_DOWN: event_dev = __pnet_find_base_ndev(event_dev); if (!smc_pnetid_by_dev_port(event_dev->dev.parent, event_dev->dev_port, ndev_pnetid)) { /* remove from PNETIDs list */ smc_pnet_remove_pnetid(net, ndev_pnetid); } return NOTIFY_OK; default: return NOTIFY_DONE; } } static struct notifier_block smc_netdev_notifier = { .notifier_call = smc_pnet_netdev_event }; /* init network namespace */ int smc_pnet_net_init(struct net *net) { struct smc_net *sn = net_generic(net, smc_net_id); struct smc_pnettable *pnettable = &sn->pnettable; struct smc_pnetids_ndev *pnetids_ndev = &sn->pnetids_ndev; INIT_LIST_HEAD(&pnettable->pnetlist); mutex_init(&pnettable->lock); INIT_LIST_HEAD(&pnetids_ndev->list); rwlock_init(&pnetids_ndev->lock); smc_pnet_create_pnetids_list(net); /* disable handshake limitation by default */ net->smc.limit_smc_hs = 0; return 0; } int __init smc_pnet_init(void) { int rc; rc = genl_register_family(&smc_pnet_nl_family); if (rc) return rc; rc = register_netdevice_notifier(&smc_netdev_notifier); if (rc) genl_unregister_family(&smc_pnet_nl_family); return rc; } /* exit network namespace */ void smc_pnet_net_exit(struct net *net) { /* flush pnet table */ smc_pnet_remove_by_pnetid(net, NULL); smc_pnet_destroy_pnetids_list(net); } void smc_pnet_exit(void) { unregister_netdevice_notifier(&smc_netdev_notifier); genl_unregister_family(&smc_pnet_nl_family); } static struct net_device *__pnet_find_base_ndev(struct net_device *ndev) { int i, nest_lvl; ASSERT_RTNL(); nest_lvl = ndev->lower_level; for (i = 0; i < nest_lvl; i++) { struct list_head *lower = &ndev->adj_list.lower; if (list_empty(lower)) break; lower = lower->next; ndev = netdev_lower_get_next(ndev, &lower); } return ndev; } /* Determine one base device for stacked net devices. * If the lower device level contains more than one devices * (for instance with bonding slaves), just the first device * is used to reach a base device. */ static struct net_device *pnet_find_base_ndev(struct net_device *ndev) { rtnl_lock(); ndev = __pnet_find_base_ndev(ndev); rtnl_unlock(); return ndev; } static int smc_pnet_find_ndev_pnetid_by_table(struct net_device *ndev, u8 *pnetid) { struct smc_pnettable *pnettable; struct net *net = dev_net(ndev); struct smc_pnetentry *pnetelem; struct smc_net *sn; int rc = -ENOENT; /* get pnettable for namespace */ sn = net_generic(net, smc_net_id); pnettable = &sn->pnettable; mutex_lock(&pnettable->lock); list_for_each_entry(pnetelem, &pnettable->pnetlist, list) { if (pnetelem->type == SMC_PNET_ETH && ndev == pnetelem->ndev) { /* get pnetid of netdev device */ memcpy(pnetid, pnetelem->pnet_name, SMC_MAX_PNETID_LEN); rc = 0; break; } } mutex_unlock(&pnettable->lock); return rc; } static int smc_pnet_determine_gid(struct smc_ib_device *ibdev, int i, struct smc_init_info *ini) { if (!ini->check_smcrv2 && !smc_ib_determine_gid(ibdev, i, ini->vlan_id, ini->ib_gid, NULL, NULL)) { ini->ib_dev = ibdev; ini->ib_port = i; return 0; } if (ini->check_smcrv2 && !smc_ib_determine_gid(ibdev, i, ini->vlan_id, ini->smcrv2.ib_gid_v2, NULL, &ini->smcrv2)) { ini->smcrv2.ib_dev_v2 = ibdev; ini->smcrv2.ib_port_v2 = i; return 0; } return -ENODEV; } /* find a roce device for the given pnetid */ static void _smc_pnet_find_roce_by_pnetid(u8 *pnet_id, struct smc_init_info *ini, struct smc_ib_device *known_dev, struct net *net) { struct smc_ib_device *ibdev; int i; mutex_lock(&smc_ib_devices.mutex); list_for_each_entry(ibdev, &smc_ib_devices.list, list) { if (ibdev == known_dev || !rdma_dev_access_netns(ibdev->ibdev, net)) continue; for (i = 1; i <= SMC_MAX_PORTS; i++) { if (!rdma_is_port_valid(ibdev->ibdev, i)) continue; if (smc_pnet_match(ibdev->pnetid[i - 1], pnet_id) && smc_ib_port_active(ibdev, i) && !test_bit(i - 1, ibdev->ports_going_away)) { if (!smc_pnet_determine_gid(ibdev, i, ini)) goto out; } } } out: mutex_unlock(&smc_ib_devices.mutex); } /* find alternate roce device with same pnet_id, vlan_id and net namespace */ void smc_pnet_find_alt_roce(struct smc_link_group *lgr, struct smc_init_info *ini, struct smc_ib_device *known_dev) { struct net *net = lgr->net; _smc_pnet_find_roce_by_pnetid(lgr->pnet_id, ini, known_dev, net); } /* if handshake network device belongs to a roce device, return its * IB device and port */ static void smc_pnet_find_rdma_dev(struct net_device *netdev, struct smc_init_info *ini) { struct net *net = dev_net(netdev); struct smc_ib_device *ibdev; mutex_lock(&smc_ib_devices.mutex); list_for_each_entry(ibdev, &smc_ib_devices.list, list) { struct net_device *ndev; int i; /* check rdma net namespace */ if (!rdma_dev_access_netns(ibdev->ibdev, net)) continue; for (i = 1; i <= SMC_MAX_PORTS; i++) { if (!rdma_is_port_valid(ibdev->ibdev, i)) continue; if (!ibdev->ibdev->ops.get_netdev) continue; ndev = ibdev->ibdev->ops.get_netdev(ibdev->ibdev, i); if (!ndev) continue; dev_put(ndev); if (netdev == ndev && smc_ib_port_active(ibdev, i) && !test_bit(i - 1, ibdev->ports_going_away)) { if (!smc_pnet_determine_gid(ibdev, i, ini)) break; } } } mutex_unlock(&smc_ib_devices.mutex); } /* Determine the corresponding IB device port based on the hardware PNETID. * Searching stops at the first matching active IB device port with vlan_id * configured. * If nothing found, check pnetid table. * If nothing found, try to use handshake device */ static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev, struct smc_init_info *ini) { u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; struct net *net; ndev = pnet_find_base_ndev(ndev); net = dev_net(ndev); if (smc_pnetid_by_dev_port(ndev->dev.parent, ndev->dev_port, ndev_pnetid) && smc_pnet_find_ndev_pnetid_by_table(ndev, ndev_pnetid)) { smc_pnet_find_rdma_dev(ndev, ini); return; /* pnetid could not be determined */ } _smc_pnet_find_roce_by_pnetid(ndev_pnetid, ini, NULL, net); } static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev, struct smc_init_info *ini) { u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; struct smcd_dev *ismdev; ndev = pnet_find_base_ndev(ndev); if (smc_pnetid_by_dev_port(ndev->dev.parent, ndev->dev_port, ndev_pnetid) && smc_pnet_find_ndev_pnetid_by_table(ndev, ndev_pnetid)) return; /* pnetid could not be determined */ mutex_lock(&smcd_dev_list.mutex); list_for_each_entry(ismdev, &smcd_dev_list.list, list) { if (smc_pnet_match(ismdev->pnetid, ndev_pnetid) && !ismdev->going_away && (!ini->ism_peer_gid[0].gid || !smc_ism_cantalk(&ini->ism_peer_gid[0], ini->vlan_id, ismdev))) { ini->ism_dev[0] = ismdev; break; } } mutex_unlock(&smcd_dev_list.mutex); } /* PNET table analysis for a given sock: * determine ib_device and port belonging to used internal TCP socket * ethernet interface. */ void smc_pnet_find_roce_resource(struct sock *sk, struct smc_init_info *ini) { struct dst_entry *dst = sk_dst_get(sk); if (!dst) goto out; if (!dst->dev) goto out_rel; smc_pnet_find_roce_by_pnetid(dst->dev, ini); out_rel: dst_release(dst); out: return; } void smc_pnet_find_ism_resource(struct sock *sk, struct smc_init_info *ini) { struct dst_entry *dst = sk_dst_get(sk); ini->ism_dev[0] = NULL; if (!dst) goto out; if (!dst->dev) goto out_rel; smc_pnet_find_ism_by_pnetid(dst->dev, ini); out_rel: dst_release(dst); out: return; } /* Lookup and apply a pnet table entry to the given ib device. */ int smc_pnetid_by_table_ib(struct smc_ib_device *smcibdev, u8 ib_port) { char *ib_name = smcibdev->ibdev->name; struct smc_pnettable *pnettable; struct smc_pnetentry *tmp_pe; struct smc_net *sn; int rc = -ENOENT; /* get pnettable for init namespace */ sn = net_generic(&init_net, smc_net_id); pnettable = &sn->pnettable; mutex_lock(&pnettable->lock); list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) { if (tmp_pe->type == SMC_PNET_IB && !strncmp(tmp_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX) && tmp_pe->ib_port == ib_port) { smc_pnet_apply_ib(smcibdev, ib_port, tmp_pe->pnet_name); rc = 0; break; } } mutex_unlock(&pnettable->lock); return rc; } /* Lookup and apply a pnet table entry to the given smcd device. */ int smc_pnetid_by_table_smcd(struct smcd_dev *smcddev) { const char *ib_name = dev_name(smcddev->ops->get_dev(smcddev)); struct smc_pnettable *pnettable; struct smc_pnetentry *tmp_pe; struct smc_net *sn; int rc = -ENOENT; /* get pnettable for init namespace */ sn = net_generic(&init_net, smc_net_id); pnettable = &sn->pnettable; mutex_lock(&pnettable->lock); list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) { if (tmp_pe->type == SMC_PNET_IB && !strncmp(tmp_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX)) { smc_pnet_apply_smcd(smcddev, tmp_pe->pnet_name); rc = 0; break; } } mutex_unlock(&pnettable->lock); return rc; }
4 6 1 1 5 4 1 4 1 4 1 4 1 5 5 4 1 5 5 4 1 5 5 5 5 121 73 122 3 1 1 1 1 3 2 1 4 1 1 1 2 3 8 8 382 204 56 62 4 3 122 112 2 58 56 43 71 11 94 8 96 6 42 42 42 2 29 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 // SPDX-License-Identifier: GPL-2.0-only /* * This file contains functions assisting in mapping VFS to 9P2000 * * Copyright (C) 2004-2008 by Eric Van Hensbergen <ericvh@gmail.com> * Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/sched.h> #include <linux/cred.h> #include <linux/parser.h> #include <linux/slab.h> #include <linux/seq_file.h> #include <net/9p/9p.h> #include <net/9p/client.h> #include <net/9p/transport.h> #include "v9fs.h" #include "v9fs_vfs.h" #include "cache.h" static DEFINE_SPINLOCK(v9fs_sessionlist_lock); static LIST_HEAD(v9fs_sessionlist); struct kmem_cache *v9fs_inode_cache; /* * Option Parsing (code inspired by NFS code) * NOTE: each transport will parse its own options */ enum { /* Options that take integer arguments */ Opt_debug, Opt_dfltuid, Opt_dfltgid, Opt_afid, /* String options */ Opt_uname, Opt_remotename, Opt_cache, Opt_cachetag, /* Options that take no arguments */ Opt_nodevmap, Opt_noxattr, Opt_directio, Opt_ignoreqv, /* Access options */ Opt_access, Opt_posixacl, /* Lock timeout option */ Opt_locktimeout, /* Error token */ Opt_err }; static const match_table_t tokens = { {Opt_debug, "debug=%x"}, {Opt_dfltuid, "dfltuid=%u"}, {Opt_dfltgid, "dfltgid=%u"}, {Opt_afid, "afid=%u"}, {Opt_uname, "uname=%s"}, {Opt_remotename, "aname=%s"}, {Opt_nodevmap, "nodevmap"}, {Opt_noxattr, "noxattr"}, {Opt_directio, "directio"}, {Opt_ignoreqv, "ignoreqv"}, {Opt_cache, "cache=%s"}, {Opt_cachetag, "cachetag=%s"}, {Opt_access, "access=%s"}, {Opt_posixacl, "posixacl"}, {Opt_locktimeout, "locktimeout=%u"}, {Opt_err, NULL} }; /* Interpret mount options for cache mode */ static int get_cache_mode(char *s) { int version = -EINVAL; if (!strcmp(s, "loose")) { version = CACHE_SC_LOOSE; p9_debug(P9_DEBUG_9P, "Cache mode: loose\n"); } else if (!strcmp(s, "fscache")) { version = CACHE_SC_FSCACHE; p9_debug(P9_DEBUG_9P, "Cache mode: fscache\n"); } else if (!strcmp(s, "mmap")) { version = CACHE_SC_MMAP; p9_debug(P9_DEBUG_9P, "Cache mode: mmap\n"); } else if (!strcmp(s, "readahead")) { version = CACHE_SC_READAHEAD; p9_debug(P9_DEBUG_9P, "Cache mode: readahead\n"); } else if (!strcmp(s, "none")) { version = CACHE_SC_NONE; p9_debug(P9_DEBUG_9P, "Cache mode: none\n"); } else if (kstrtoint(s, 0, &version) != 0) { version = -EINVAL; pr_info("Unknown Cache mode or invalid value %s\n", s); } return version; } /* * Display the mount options in /proc/mounts. */ int v9fs_show_options(struct seq_file *m, struct dentry *root) { struct v9fs_session_info *v9ses = root->d_sb->s_fs_info; if (v9ses->debug) seq_printf(m, ",debug=%x", v9ses->debug); if (!uid_eq(v9ses->dfltuid, V9FS_DEFUID)) seq_printf(m, ",dfltuid=%u", from_kuid_munged(&init_user_ns, v9ses->dfltuid)); if (!gid_eq(v9ses->dfltgid, V9FS_DEFGID)) seq_printf(m, ",dfltgid=%u", from_kgid_munged(&init_user_ns, v9ses->dfltgid)); if (v9ses->afid != ~0) seq_printf(m, ",afid=%u", v9ses->afid); if (strcmp(v9ses->uname, V9FS_DEFUSER) != 0) seq_printf(m, ",uname=%s", v9ses->uname); if (strcmp(v9ses->aname, V9FS_DEFANAME) != 0) seq_printf(m, ",aname=%s", v9ses->aname); if (v9ses->nodev) seq_puts(m, ",nodevmap"); if (v9ses->cache) seq_printf(m, ",cache=%x", v9ses->cache); #ifdef CONFIG_9P_FSCACHE if (v9ses->cachetag && (v9ses->cache & CACHE_FSCACHE)) seq_printf(m, ",cachetag=%s", v9ses->cachetag); #endif switch (v9ses->flags & V9FS_ACCESS_MASK) { case V9FS_ACCESS_USER: seq_puts(m, ",access=user"); break; case V9FS_ACCESS_ANY: seq_puts(m, ",access=any"); break; case V9FS_ACCESS_CLIENT: seq_puts(m, ",access=client"); break; case V9FS_ACCESS_SINGLE: seq_printf(m, ",access=%u", from_kuid_munged(&init_user_ns, v9ses->uid)); break; } if (v9ses->flags & V9FS_IGNORE_QV) seq_puts(m, ",ignoreqv"); if (v9ses->flags & V9FS_DIRECT_IO) seq_puts(m, ",directio"); if (v9ses->flags & V9FS_POSIX_ACL) seq_puts(m, ",posixacl"); if (v9ses->flags & V9FS_NO_XATTR) seq_puts(m, ",noxattr"); return p9_show_client_options(m, v9ses->clnt); } /** * v9fs_parse_options - parse mount options into session structure * @v9ses: existing v9fs session information * @opts: The mount option string * * Return 0 upon success, -ERRNO upon failure. */ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) { char *options, *tmp_options; substring_t args[MAX_OPT_ARGS]; char *p; int option = 0; char *s; int ret = 0; /* setup defaults */ v9ses->afid = ~0; v9ses->debug = 0; v9ses->cache = CACHE_NONE; #ifdef CONFIG_9P_FSCACHE v9ses->cachetag = NULL; #endif v9ses->session_lock_timeout = P9_LOCK_TIMEOUT; if (!opts) return 0; tmp_options = kstrdup(opts, GFP_KERNEL); if (!tmp_options) { ret = -ENOMEM; goto fail_option_alloc; } options = tmp_options; while ((p = strsep(&options, ",")) != NULL) { int token, r; if (!*p) continue; token = match_token(p, tokens, args); switch (token) { case Opt_debug: r = match_int(&args[0], &option); if (r < 0) { p9_debug(P9_DEBUG_ERROR, "integer field, but no integer?\n"); ret = r; } else { v9ses->debug = option; #ifdef CONFIG_NET_9P_DEBUG p9_debug_level = option; #endif } break; case Opt_dfltuid: r = match_int(&args[0], &option); if (r < 0) { p9_debug(P9_DEBUG_ERROR, "integer field, but no integer?\n"); ret = r; continue; } v9ses->dfltuid = make_kuid(current_user_ns(), option); if (!uid_valid(v9ses->dfltuid)) { p9_debug(P9_DEBUG_ERROR, "uid field, but not a uid?\n"); ret = -EINVAL; } break; case Opt_dfltgid: r = match_int(&args[0], &option); if (r < 0) { p9_debug(P9_DEBUG_ERROR, "integer field, but no integer?\n"); ret = r; continue; } v9ses->dfltgid = make_kgid(current_user_ns(), option); if (!gid_valid(v9ses->dfltgid)) { p9_debug(P9_DEBUG_ERROR, "gid field, but not a gid?\n"); ret = -EINVAL; } break; case Opt_afid: r = match_int(&args[0], &option); if (r < 0) { p9_debug(P9_DEBUG_ERROR, "integer field, but no integer?\n"); ret = r; } else { v9ses->afid = option; } break; case Opt_uname: kfree(v9ses->uname); v9ses->uname = match_strdup(&args[0]); if (!v9ses->uname) { ret = -ENOMEM; goto free_and_return; } break; case Opt_remotename: kfree(v9ses->aname); v9ses->aname = match_strdup(&args[0]); if (!v9ses->aname) { ret = -ENOMEM; goto free_and_return; } break; case Opt_nodevmap: v9ses->nodev = 1; break; case Opt_noxattr: v9ses->flags |= V9FS_NO_XATTR; break; case Opt_directio: v9ses->flags |= V9FS_DIRECT_IO; break; case Opt_ignoreqv: v9ses->flags |= V9FS_IGNORE_QV; break; case Opt_cachetag: #ifdef CONFIG_9P_FSCACHE kfree(v9ses->cachetag); v9ses->cachetag = match_strdup(&args[0]); if (!v9ses->cachetag) { ret = -ENOMEM; goto free_and_return; } #endif break; case Opt_cache: s = match_strdup(&args[0]); if (!s) { ret = -ENOMEM; p9_debug(P9_DEBUG_ERROR, "problem allocating copy of cache arg\n"); goto free_and_return; } r = get_cache_mode(s); if (r < 0) ret = r; else v9ses->cache = r; kfree(s); break; case Opt_access: s = match_strdup(&args[0]); if (!s) { ret = -ENOMEM; p9_debug(P9_DEBUG_ERROR, "problem allocating copy of access arg\n"); goto free_and_return; } v9ses->flags &= ~V9FS_ACCESS_MASK; if (strcmp(s, "user") == 0) v9ses->flags |= V9FS_ACCESS_USER; else if (strcmp(s, "any") == 0) v9ses->flags |= V9FS_ACCESS_ANY; else if (strcmp(s, "client") == 0) { v9ses->flags |= V9FS_ACCESS_CLIENT; } else { uid_t uid; v9ses->flags |= V9FS_ACCESS_SINGLE; r = kstrtouint(s, 10, &uid); if (r) { ret = r; pr_info("Unknown access argument %s: %d\n", s, r); kfree(s); continue; } v9ses->uid = make_kuid(current_user_ns(), uid); if (!uid_valid(v9ses->uid)) { ret = -EINVAL; pr_info("Unknown uid %s\n", s); } } kfree(s); break; case Opt_posixacl: #ifdef CONFIG_9P_FS_POSIX_ACL v9ses->flags |= V9FS_POSIX_ACL; #else p9_debug(P9_DEBUG_ERROR, "Not defined CONFIG_9P_FS_POSIX_ACL. Ignoring posixacl option\n"); #endif break; case Opt_locktimeout: r = match_int(&args[0], &option); if (r < 0) { p9_debug(P9_DEBUG_ERROR, "integer field, but no integer?\n"); ret = r; continue; } if (option < 1) { p9_debug(P9_DEBUG_ERROR, "locktimeout must be a greater than zero integer.\n"); ret = -EINVAL; continue; } v9ses->session_lock_timeout = (long)option * HZ; break; default: continue; } } free_and_return: kfree(tmp_options); fail_option_alloc: return ret; } /** * v9fs_session_init - initialize session * @v9ses: session information structure * @dev_name: device being mounted * @data: options * */ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, const char *dev_name, char *data) { struct p9_fid *fid; int rc = -ENOMEM; v9ses->uname = kstrdup(V9FS_DEFUSER, GFP_KERNEL); if (!v9ses->uname) goto err_names; v9ses->aname = kstrdup(V9FS_DEFANAME, GFP_KERNEL); if (!v9ses->aname) goto err_names; init_rwsem(&v9ses->rename_sem); v9ses->uid = INVALID_UID; v9ses->dfltuid = V9FS_DEFUID; v9ses->dfltgid = V9FS_DEFGID; v9ses->clnt = p9_client_create(dev_name, data); if (IS_ERR(v9ses->clnt)) { rc = PTR_ERR(v9ses->clnt); p9_debug(P9_DEBUG_ERROR, "problem initializing 9p client\n"); goto err_names; } v9ses->flags = V9FS_ACCESS_USER; if (p9_is_proto_dotl(v9ses->clnt)) { v9ses->flags = V9FS_ACCESS_CLIENT; v9ses->flags |= V9FS_PROTO_2000L; } else if (p9_is_proto_dotu(v9ses->clnt)) { v9ses->flags |= V9FS_PROTO_2000U; } rc = v9fs_parse_options(v9ses, data); if (rc < 0) goto err_clnt; v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ; if (!v9fs_proto_dotl(v9ses) && ((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_CLIENT)) { /* * We support ACCESS_CLIENT only for dotl. * Fall back to ACCESS_USER */ v9ses->flags &= ~V9FS_ACCESS_MASK; v9ses->flags |= V9FS_ACCESS_USER; } /*FIXME !! */ /* for legacy mode, fall back to V9FS_ACCESS_ANY */ if (!(v9fs_proto_dotu(v9ses) || v9fs_proto_dotl(v9ses)) && ((v9ses->flags&V9FS_ACCESS_MASK) == V9FS_ACCESS_USER)) { v9ses->flags &= ~V9FS_ACCESS_MASK; v9ses->flags |= V9FS_ACCESS_ANY; v9ses->uid = INVALID_UID; } if (!v9fs_proto_dotl(v9ses) || !((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_CLIENT)) { /* * We support ACL checks on clinet only if the protocol is * 9P2000.L and access is V9FS_ACCESS_CLIENT. */ v9ses->flags &= ~V9FS_ACL_MASK; } fid = p9_client_attach(v9ses->clnt, NULL, v9ses->uname, INVALID_UID, v9ses->aname); if (IS_ERR(fid)) { rc = PTR_ERR(fid); p9_debug(P9_DEBUG_ERROR, "cannot attach\n"); goto err_clnt; } if ((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_SINGLE) fid->uid = v9ses->uid; else fid->uid = INVALID_UID; #ifdef CONFIG_9P_FSCACHE /* register the session for caching */ if (v9ses->cache & CACHE_FSCACHE) { rc = v9fs_cache_session_get_cookie(v9ses, dev_name); if (rc < 0) goto err_clnt; } #endif spin_lock(&v9fs_sessionlist_lock); list_add(&v9ses->slist, &v9fs_sessionlist); spin_unlock(&v9fs_sessionlist_lock); return fid; err_clnt: #ifdef CONFIG_9P_FSCACHE kfree(v9ses->cachetag); #endif p9_client_destroy(v9ses->clnt); err_names: kfree(v9ses->uname); kfree(v9ses->aname); return ERR_PTR(rc); } /** * v9fs_session_close - shutdown a session * @v9ses: session information structure * */ void v9fs_session_close(struct v9fs_session_info *v9ses) { if (v9ses->clnt) { p9_client_destroy(v9ses->clnt); v9ses->clnt = NULL; } #ifdef CONFIG_9P_FSCACHE fscache_relinquish_volume(v9fs_session_cache(v9ses), NULL, false); kfree(v9ses->cachetag); #endif kfree(v9ses->uname); kfree(v9ses->aname); spin_lock(&v9fs_sessionlist_lock); list_del(&v9ses->slist); spin_unlock(&v9fs_sessionlist_lock); } /** * v9fs_session_cancel - terminate a session * @v9ses: session to terminate * * mark transport as disconnected and cancel all pending requests. */ void v9fs_session_cancel(struct v9fs_session_info *v9ses) { p9_debug(P9_DEBUG_ERROR, "cancel session %p\n", v9ses); p9_client_disconnect(v9ses->clnt); } /** * v9fs_session_begin_cancel - Begin terminate of a session * @v9ses: session to terminate * * After this call we don't allow any request other than clunk. */ void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses) { p9_debug(P9_DEBUG_ERROR, "begin cancel session %p\n", v9ses); p9_client_begin_disconnect(v9ses->clnt); } static struct kobject *v9fs_kobj; #ifdef CONFIG_9P_FSCACHE /* * List caches associated with a session */ static ssize_t caches_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { ssize_t n = 0, count = 0, limit = PAGE_SIZE; struct v9fs_session_info *v9ses; spin_lock(&v9fs_sessionlist_lock); list_for_each_entry(v9ses, &v9fs_sessionlist, slist) { if (v9ses->cachetag) { n = snprintf(buf, limit, "%s\n", v9ses->cachetag); if (n < 0) { count = n; break; } count += n; limit -= n; } } spin_unlock(&v9fs_sessionlist_lock); return count; } static struct kobj_attribute v9fs_attr_cache = __ATTR_RO(caches); #endif /* CONFIG_9P_FSCACHE */ static struct attribute *v9fs_attrs[] = { #ifdef CONFIG_9P_FSCACHE &v9fs_attr_cache.attr, #endif NULL, }; static const struct attribute_group v9fs_attr_group = { .attrs = v9fs_attrs, }; /** * v9fs_sysfs_init - Initialize the v9fs sysfs interface * */ static int __init v9fs_sysfs_init(void) { v9fs_kobj = kobject_create_and_add("9p", fs_kobj); if (!v9fs_kobj) return -ENOMEM; if (sysfs_create_group(v9fs_kobj, &v9fs_attr_group)) { kobject_put(v9fs_kobj); return -ENOMEM; } return 0; } /** * v9fs_sysfs_cleanup - Unregister the v9fs sysfs interface * */ static void v9fs_sysfs_cleanup(void) { sysfs_remove_group(v9fs_kobj, &v9fs_attr_group); kobject_put(v9fs_kobj); } static void v9fs_inode_init_once(void *foo) { struct v9fs_inode *v9inode = (struct v9fs_inode *)foo; memset(&v9inode->qid, 0, sizeof(v9inode->qid)); inode_init_once(&v9inode->netfs.inode); } /** * v9fs_init_inode_cache - initialize a cache for 9P * Returns 0 on success. */ static int v9fs_init_inode_cache(void) { v9fs_inode_cache = kmem_cache_create("v9fs_inode_cache", sizeof(struct v9fs_inode), 0, (SLAB_RECLAIM_ACCOUNT| SLAB_MEM_SPREAD|SLAB_ACCOUNT), v9fs_inode_init_once); if (!v9fs_inode_cache) return -ENOMEM; return 0; } /** * v9fs_destroy_inode_cache - destroy the cache of 9P inode * */ static void v9fs_destroy_inode_cache(void) { /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. */ rcu_barrier(); kmem_cache_destroy(v9fs_inode_cache); } static int v9fs_cache_register(void) { int ret; ret = v9fs_init_inode_cache(); if (ret < 0) return ret; return ret; } static void v9fs_cache_unregister(void) { v9fs_destroy_inode_cache(); } /** * init_v9fs - Initialize module * */ static int __init init_v9fs(void) { int err; pr_info("Installing v9fs 9p2000 file system support\n"); /* TODO: Setup list of registered trasnport modules */ err = v9fs_cache_register(); if (err < 0) { pr_err("Failed to register v9fs for caching\n"); return err; } err = v9fs_sysfs_init(); if (err < 0) { pr_err("Failed to register with sysfs\n"); goto out_cache; } err = register_filesystem(&v9fs_fs_type); if (err < 0) { pr_err("Failed to register filesystem\n"); goto out_sysfs_cleanup; } return 0; out_sysfs_cleanup: v9fs_sysfs_cleanup(); out_cache: v9fs_cache_unregister(); return err; } /** * exit_v9fs - shutdown module * */ static void __exit exit_v9fs(void) { v9fs_sysfs_cleanup(); v9fs_cache_unregister(); unregister_filesystem(&v9fs_fs_type); } module_init(init_v9fs) module_exit(exit_v9fs) MODULE_AUTHOR("Latchesar Ionkov <lucho@ionkov.net>"); MODULE_AUTHOR("Eric Van Hensbergen <ericvh@gmail.com>"); MODULE_AUTHOR("Ron Minnich <rminnich@lanl.gov>"); MODULE_DESCRIPTION("9P Client File System"); MODULE_LICENSE("GPL");
13 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 // SPDX-License-Identifier: GPL-2.0 /* File: fs/ext4/acl.h (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org> */ #include <linux/posix_acl_xattr.h> #define EXT4_ACL_VERSION 0x0001 typedef struct { __le16 e_tag; __le16 e_perm; __le32 e_id; } ext4_acl_entry; typedef struct { __le16 e_tag; __le16 e_perm; } ext4_acl_entry_short; typedef struct { __le32 a_version; } ext4_acl_header; static inline size_t ext4_acl_size(int count) { if (count <= 4) { return sizeof(ext4_acl_header) + count * sizeof(ext4_acl_entry_short); } else { return sizeof(ext4_acl_header) + 4 * sizeof(ext4_acl_entry_short) + (count - 4) * sizeof(ext4_acl_entry); } } static inline int ext4_acl_count(size_t size) { ssize_t s; size -= sizeof(ext4_acl_header); s = size - 4 * sizeof(ext4_acl_entry_short); if (s < 0) { if (size % sizeof(ext4_acl_entry_short)) return -1; return size / sizeof(ext4_acl_entry_short); } else { if (s % sizeof(ext4_acl_entry)) return -1; return s / sizeof(ext4_acl_entry) + 4; } } #ifdef CONFIG_EXT4_FS_POSIX_ACL /* acl.c */ struct posix_acl *ext4_get_acl(struct inode *inode, int type, bool rcu); int ext4_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); extern int ext4_init_acl(handle_t *, struct inode *, struct inode *); #else /* CONFIG_EXT4_FS_POSIX_ACL */ #include <linux/sched.h> #define ext4_get_acl NULL #define ext4_set_acl NULL static inline int ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) { /* usually, the umask is applied by posix_acl_create(), but if ext4 ACL support is disabled at compile time, we need to do it here, because posix_acl_create() will never be called */ inode->i_mode &= ~current_umask(); return 0; } #endif /* CONFIG_EXT4_FS_POSIX_ACL */
6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * include/linux/if_team.h - Network team device driver header * Copyright (c) 2011 Jiri Pirko <jpirko@redhat.com> */ #ifndef _LINUX_IF_TEAM_H_ #define _LINUX_IF_TEAM_H_ #include <linux/netpoll.h> #include <net/sch_generic.h> #include <linux/types.h> #include <uapi/linux/if_team.h> struct team_pcpu_stats { u64_stats_t rx_packets; u64_stats_t rx_bytes; u64_stats_t rx_multicast; u64_stats_t tx_packets; u64_stats_t tx_bytes; struct u64_stats_sync syncp; u32 rx_dropped; u32 tx_dropped; u32 rx_nohandler; }; struct team; struct team_port { struct net_device *dev; struct hlist_node hlist; /* node in enabled ports hash list */ struct list_head list; /* node in ordinary list */ struct team *team; int index; /* index of enabled port. If disabled, it's set to -1 */ bool linkup; /* either state.linkup or user.linkup */ struct { bool linkup; u32 speed; u8 duplex; } state; /* Values set by userspace */ struct { bool linkup; bool linkup_enabled; } user; /* Custom gennetlink interface related flags */ bool changed; bool removed; /* * A place for storing original values of the device before it * become a port. */ struct { unsigned char dev_addr[MAX_ADDR_LEN]; unsigned int mtu; } orig; #ifdef CONFIG_NET_POLL_CONTROLLER struct netpoll *np; #endif s32 priority; /* lower number ~ higher priority */ u16 queue_id; struct list_head qom_list; /* node in queue override mapping list */ struct rcu_head rcu; long mode_priv[]; }; static inline struct team_port *team_port_get_rcu(const struct net_device *dev) { return rcu_dereference(dev->rx_handler_data); } static inline bool team_port_enabled(struct team_port *port) { return port->index != -1; } static inline bool team_port_txable(struct team_port *port) { return port->linkup && team_port_enabled(port); } static inline bool team_port_dev_txable(const struct net_device *port_dev) { struct team_port *port; bool txable; rcu_read_lock(); port = team_port_get_rcu(port_dev); txable = port ? team_port_txable(port) : false; rcu_read_unlock(); return txable; } #ifdef CONFIG_NET_POLL_CONTROLLER static inline void team_netpoll_send_skb(struct team_port *port, struct sk_buff *skb) { netpoll_send_skb(port->np, skb); } #else static inline void team_netpoll_send_skb(struct team_port *port, struct sk_buff *skb) { } #endif struct team_mode_ops { int (*init)(struct team *team); void (*exit)(struct team *team); rx_handler_result_t (*receive)(struct team *team, struct team_port *port, struct sk_buff *skb); bool (*transmit)(struct team *team, struct sk_buff *skb); int (*port_enter)(struct team *team, struct team_port *port); void (*port_leave)(struct team *team, struct team_port *port); void (*port_change_dev_addr)(struct team *team, struct team_port *port); void (*port_enabled)(struct team *team, struct team_port *port); void (*port_disabled)(struct team *team, struct team_port *port); }; extern int team_modeop_port_enter(struct team *team, struct team_port *port); extern void team_modeop_port_change_dev_addr(struct team *team, struct team_port *port); enum team_option_type { TEAM_OPTION_TYPE_U32, TEAM_OPTION_TYPE_STRING, TEAM_OPTION_TYPE_BINARY, TEAM_OPTION_TYPE_BOOL, TEAM_OPTION_TYPE_S32, }; struct team_option_inst_info { u32 array_index; struct team_port *port; /* != NULL if per-port */ }; struct team_gsetter_ctx { union { u32 u32_val; const char *str_val; struct { const void *ptr; u32 len; } bin_val; bool bool_val; s32 s32_val; } data; struct team_option_inst_info *info; }; struct team_option { struct list_head list; const char *name; bool per_port; unsigned int array_size; /* != 0 means the option is array */ enum team_option_type type; void (*init)(struct team *team, struct team_option_inst_info *info); void (*getter)(struct team *team, struct team_gsetter_ctx *ctx); int (*setter)(struct team *team, struct team_gsetter_ctx *ctx); }; extern void team_option_inst_set_change(struct team_option_inst_info *opt_inst_info); extern void team_options_change_check(struct team *team); struct team_mode { const char *kind; struct module *owner; size_t priv_size; size_t port_priv_size; const struct team_mode_ops *ops; enum netdev_lag_tx_type lag_tx_type; }; #define TEAM_PORT_HASHBITS 4 #define TEAM_PORT_HASHENTRIES (1 << TEAM_PORT_HASHBITS) #define TEAM_MODE_PRIV_LONGS 4 #define TEAM_MODE_PRIV_SIZE (sizeof(long) * TEAM_MODE_PRIV_LONGS) struct team { struct net_device *dev; /* associated netdevice */ struct team_pcpu_stats __percpu *pcpu_stats; const struct header_ops *header_ops_cache; struct mutex lock; /* used for overall locking, e.g. port lists write */ /* * List of enabled ports and their count */ int en_port_count; struct hlist_head en_port_hlist[TEAM_PORT_HASHENTRIES]; struct list_head port_list; /* list of all ports */ struct list_head option_list; struct list_head option_inst_list; /* list of option instances */ const struct team_mode *mode; struct team_mode_ops ops; bool user_carrier_enabled; bool queue_override_enabled; struct list_head *qom_lists; /* array of queue override mapping lists */ bool port_mtu_change_allowed; bool notifier_ctx; struct { unsigned int count; unsigned int interval; /* in ms */ atomic_t count_pending; struct delayed_work dw; } notify_peers; struct { unsigned int count; unsigned int interval; /* in ms */ atomic_t count_pending; struct delayed_work dw; } mcast_rejoin; struct lock_class_key team_lock_key; long mode_priv[TEAM_MODE_PRIV_LONGS]; }; static inline int team_dev_queue_xmit(struct team *team, struct team_port *port, struct sk_buff *skb) { BUILD_BUG_ON(sizeof(skb->queue_mapping) != sizeof(qdisc_skb_cb(skb)->slave_dev_queue_mapping)); skb_set_queue_mapping(skb, qdisc_skb_cb(skb)->slave_dev_queue_mapping); skb->dev = port->dev; if (unlikely(netpoll_tx_running(team->dev))) { team_netpoll_send_skb(port, skb); return 0; } return dev_queue_xmit(skb); } static inline struct hlist_head *team_port_index_hash(struct team *team, int port_index) { return &team->en_port_hlist[port_index & (TEAM_PORT_HASHENTRIES - 1)]; } static inline struct team_port *team_get_port_by_index(struct team *team, int port_index) { struct team_port *port; struct hlist_head *head = team_port_index_hash(team, port_index); hlist_for_each_entry(port, head, hlist) if (port->index == port_index) return port; return NULL; } static inline int team_num_to_port_index(struct team *team, unsigned int num) { int en_port_count = READ_ONCE(team->en_port_count); if (unlikely(!en_port_count)) return 0; return num % en_port_count; } static inline struct team_port *team_get_port_by_index_rcu(struct team *team, int port_index) { struct team_port *port; struct hlist_head *head = team_port_index_hash(team, port_index); hlist_for_each_entry_rcu(port, head, hlist) if (port->index == port_index) return port; return NULL; } static inline struct team_port * team_get_first_port_txable_rcu(struct team *team, struct team_port *port) { struct team_port *cur; if (likely(team_port_txable(port))) return port; cur = port; list_for_each_entry_continue_rcu(cur, &team->port_list, list) if (team_port_txable(cur)) return cur; list_for_each_entry_rcu(cur, &team->port_list, list) { if (cur == port) break; if (team_port_txable(cur)) return cur; } return NULL; } extern int team_options_register(struct team *team, const struct team_option *option, size_t option_count); extern void team_options_unregister(struct team *team, const struct team_option *option, size_t option_count); extern int team_mode_register(const struct team_mode *mode); extern void team_mode_unregister(const struct team_mode *mode); #define TEAM_DEFAULT_NUM_TX_QUEUES 16 #define TEAM_DEFAULT_NUM_RX_QUEUES 16 #define MODULE_ALIAS_TEAM_MODE(kind) MODULE_ALIAS("team-mode-" kind) #endif /* _LINUX_IF_TEAM_H_ */
1 1 1 1 3 12 3 2 1 1 11 4 5 4 5 3 4 10 9 1 6 2 14 14 12 2 2 14 10 2 14 13 3 2 14 2 11 2 3 3 1 2 40 41 20 20 42 1 41 1 37 3 40 41 1 1 1 1 1 1 4 4 3 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 // SPDX-License-Identifier: GPL-2.0-only /* * Media device * * Copyright (C) 2010 Nokia Corporation * * Contacts: Laurent Pinchart <laurent.pinchart@ideasonboard.com> * Sakari Ailus <sakari.ailus@iki.fi> */ #include <linux/compat.h> #include <linux/export.h> #include <linux/idr.h> #include <linux/ioctl.h> #include <linux/media.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/pci.h> #include <linux/usb.h> #include <linux/version.h> #include <media/media-device.h> #include <media/media-devnode.h> #include <media/media-entity.h> #include <media/media-request.h> #ifdef CONFIG_MEDIA_CONTROLLER /* * Legacy defines from linux/media.h. This is the only place we need this * so we just define it here. The media.h header doesn't expose it to the * kernel to prevent it from being used by drivers, but here (and only here!) * we need it to handle the legacy behavior. */ #define MEDIA_ENT_SUBTYPE_MASK 0x0000ffff #define MEDIA_ENT_T_DEVNODE_UNKNOWN (MEDIA_ENT_F_OLD_BASE | \ MEDIA_ENT_SUBTYPE_MASK) /* ----------------------------------------------------------------------------- * Userspace API */ static inline void __user *media_get_uptr(__u64 arg) { return (void __user *)(uintptr_t)arg; } static int media_device_open(struct file *filp) { return 0; } static int media_device_close(struct file *filp) { return 0; } static long media_device_get_info(struct media_device *dev, void *arg) { struct media_device_info *info = arg; memset(info, 0, sizeof(*info)); if (dev->driver_name[0]) strscpy(info->driver, dev->driver_name, sizeof(info->driver)); else strscpy(info->driver, dev->dev->driver->name, sizeof(info->driver)); strscpy(info->model, dev->model, sizeof(info->model)); strscpy(info->serial, dev->serial, sizeof(info->serial)); strscpy(info->bus_info, dev->bus_info, sizeof(info->bus_info)); info->media_version = LINUX_VERSION_CODE; info->driver_version = info->media_version; info->hw_revision = dev->hw_revision; return 0; } static struct media_entity *find_entity(struct media_device *mdev, u32 id) { struct media_entity *entity; int next = id & MEDIA_ENT_ID_FLAG_NEXT; id &= ~MEDIA_ENT_ID_FLAG_NEXT; media_device_for_each_entity(entity, mdev) { if (((media_entity_id(entity) == id) && !next) || ((media_entity_id(entity) > id) && next)) { return entity; } } return NULL; } static long media_device_enum_entities(struct media_device *mdev, void *arg) { struct media_entity_desc *entd = arg; struct media_entity *ent; ent = find_entity(mdev, entd->id); if (ent == NULL) return -EINVAL; memset(entd, 0, sizeof(*entd)); entd->id = media_entity_id(ent); if (ent->name) strscpy(entd->name, ent->name, sizeof(entd->name)); entd->type = ent->function; entd->revision = 0; /* Unused */ entd->flags = ent->flags; entd->group_id = 0; /* Unused */ entd->pads = ent->num_pads; entd->links = ent->num_links - ent->num_backlinks; /* * Workaround for a bug at media-ctl <= v1.10 that makes it to * do the wrong thing if the entity function doesn't belong to * either MEDIA_ENT_F_OLD_BASE or MEDIA_ENT_F_OLD_SUBDEV_BASE * Ranges. * * Non-subdevices are expected to be at the MEDIA_ENT_F_OLD_BASE, * or, otherwise, will be silently ignored by media-ctl when * printing the graphviz diagram. So, map them into the devnode * old range. */ if (ent->function < MEDIA_ENT_F_OLD_BASE || ent->function > MEDIA_ENT_F_TUNER) { if (is_media_entity_v4l2_subdev(ent)) entd->type = MEDIA_ENT_F_V4L2_SUBDEV_UNKNOWN; else if (ent->function != MEDIA_ENT_F_IO_V4L) entd->type = MEDIA_ENT_T_DEVNODE_UNKNOWN; } memcpy(&entd->raw, &ent->info, sizeof(ent->info)); return 0; } static void media_device_kpad_to_upad(const struct media_pad *kpad, struct media_pad_desc *upad) { upad->entity = media_entity_id(kpad->entity); upad->index = kpad->index; upad->flags = kpad->flags; } static long media_device_enum_links(struct media_device *mdev, void *arg) { struct media_links_enum *links = arg; struct media_entity *entity; entity = find_entity(mdev, links->entity); if (entity == NULL) return -EINVAL; if (links->pads) { unsigned int p; for (p = 0; p < entity->num_pads; p++) { struct media_pad_desc pad; memset(&pad, 0, sizeof(pad)); media_device_kpad_to_upad(&entity->pads[p], &pad); if (copy_to_user(&links->pads[p], &pad, sizeof(pad))) return -EFAULT; } } if (links->links) { struct media_link *link; struct media_link_desc __user *ulink_desc = links->links; list_for_each_entry(link, &entity->links, list) { struct media_link_desc klink_desc; /* Ignore backlinks. */ if (link->source->entity != entity) continue; memset(&klink_desc, 0, sizeof(klink_desc)); media_device_kpad_to_upad(link->source, &klink_desc.source); media_device_kpad_to_upad(link->sink, &klink_desc.sink); klink_desc.flags = link->flags; if (copy_to_user(ulink_desc, &klink_desc, sizeof(*ulink_desc))) return -EFAULT; ulink_desc++; } } memset(links->reserved, 0, sizeof(links->reserved)); return 0; } static long media_device_setup_link(struct media_device *mdev, void *arg) { struct media_link_desc *linkd = arg; struct media_link *link = NULL; struct media_entity *source; struct media_entity *sink; /* Find the source and sink entities and link. */ source = find_entity(mdev, linkd->source.entity); sink = find_entity(mdev, linkd->sink.entity); if (source == NULL || sink == NULL) return -EINVAL; if (linkd->source.index >= source->num_pads || linkd->sink.index >= sink->num_pads) return -EINVAL; link = media_entity_find_link(&source->pads[linkd->source.index], &sink->pads[linkd->sink.index]); if (link == NULL) return -EINVAL; memset(linkd->reserved, 0, sizeof(linkd->reserved)); /* Setup the link on both entities. */ return __media_entity_setup_link(link, linkd->flags); } static long media_device_get_topology(struct media_device *mdev, void *arg) { struct media_v2_topology *topo = arg; struct media_entity *entity; struct media_interface *intf; struct media_pad *pad; struct media_link *link; struct media_v2_entity kentity, __user *uentity; struct media_v2_interface kintf, __user *uintf; struct media_v2_pad kpad, __user *upad; struct media_v2_link klink, __user *ulink; unsigned int i; int ret = 0; topo->topology_version = mdev->topology_version; /* Get entities and number of entities */ i = 0; uentity = media_get_uptr(topo->ptr_entities); media_device_for_each_entity(entity, mdev) { i++; if (ret || !uentity) continue; if (i > topo->num_entities) { ret = -ENOSPC; continue; } /* Copy fields to userspace struct if not error */ memset(&kentity, 0, sizeof(kentity)); kentity.id = entity->graph_obj.id; kentity.function = entity->function; kentity.flags = entity->flags; strscpy(kentity.name, entity->name, sizeof(kentity.name)); if (copy_to_user(uentity, &kentity, sizeof(kentity))) ret = -EFAULT; uentity++; } topo->num_entities = i; topo->reserved1 = 0; /* Get interfaces and number of interfaces */ i = 0; uintf = media_get_uptr(topo->ptr_interfaces); media_device_for_each_intf(intf, mdev) { i++; if (ret || !uintf) continue; if (i > topo->num_interfaces) { ret = -ENOSPC; continue; } memset(&kintf, 0, sizeof(kintf)); /* Copy intf fields to userspace struct */ kintf.id = intf->graph_obj.id; kintf.intf_type = intf->type; kintf.flags = intf->flags; if (media_type(&intf->graph_obj) == MEDIA_GRAPH_INTF_DEVNODE) { struct media_intf_devnode *devnode; devnode = intf_to_devnode(intf); kintf.devnode.major = devnode->major; kintf.devnode.minor = devnode->minor; } if (copy_to_user(uintf, &kintf, sizeof(kintf))) ret = -EFAULT; uintf++; } topo->num_interfaces = i; topo->reserved2 = 0; /* Get pads and number of pads */ i = 0; upad = media_get_uptr(topo->ptr_pads); media_device_for_each_pad(pad, mdev) { i++; if (ret || !upad) continue; if (i > topo->num_pads) { ret = -ENOSPC; continue; } memset(&kpad, 0, sizeof(kpad)); /* Copy pad fields to userspace struct */ kpad.id = pad->graph_obj.id; kpad.entity_id = pad->entity->graph_obj.id; kpad.flags = pad->flags; kpad.index = pad->index; if (copy_to_user(upad, &kpad, sizeof(kpad))) ret = -EFAULT; upad++; } topo->num_pads = i; topo->reserved3 = 0; /* Get links and number of links */ i = 0; ulink = media_get_uptr(topo->ptr_links); media_device_for_each_link(link, mdev) { if (link->is_backlink) continue; i++; if (ret || !ulink) continue; if (i > topo->num_links) { ret = -ENOSPC; continue; } memset(&klink, 0, sizeof(klink)); /* Copy link fields to userspace struct */ klink.id = link->graph_obj.id; klink.source_id = link->gobj0->id; klink.sink_id = link->gobj1->id; klink.flags = link->flags; if (copy_to_user(ulink, &klink, sizeof(klink))) ret = -EFAULT; ulink++; } topo->num_links = i; topo->reserved4 = 0; return ret; } static long media_device_request_alloc(struct media_device *mdev, void *arg) { int *alloc_fd = arg; if (!mdev->ops || !mdev->ops->req_validate || !mdev->ops->req_queue) return -ENOTTY; return media_request_alloc(mdev, alloc_fd); } static long copy_arg_from_user(void *karg, void __user *uarg, unsigned int cmd) { if ((_IOC_DIR(cmd) & _IOC_WRITE) && copy_from_user(karg, uarg, _IOC_SIZE(cmd))) return -EFAULT; return 0; } static long copy_arg_to_user(void __user *uarg, void *karg, unsigned int cmd) { if ((_IOC_DIR(cmd) & _IOC_READ) && copy_to_user(uarg, karg, _IOC_SIZE(cmd))) return -EFAULT; return 0; } /* Do acquire the graph mutex */ #define MEDIA_IOC_FL_GRAPH_MUTEX BIT(0) #define MEDIA_IOC_ARG(__cmd, func, fl, from_user, to_user) \ [_IOC_NR(MEDIA_IOC_##__cmd)] = { \ .cmd = MEDIA_IOC_##__cmd, \ .fn = func, \ .flags = fl, \ .arg_from_user = from_user, \ .arg_to_user = to_user, \ } #define MEDIA_IOC(__cmd, func, fl) \ MEDIA_IOC_ARG(__cmd, func, fl, copy_arg_from_user, copy_arg_to_user) /* the table is indexed by _IOC_NR(cmd) */ struct media_ioctl_info { unsigned int cmd; unsigned short flags; long (*fn)(struct media_device *dev, void *arg); long (*arg_from_user)(void *karg, void __user *uarg, unsigned int cmd); long (*arg_to_user)(void __user *uarg, void *karg, unsigned int cmd); }; static const struct media_ioctl_info ioctl_info[] = { MEDIA_IOC(DEVICE_INFO, media_device_get_info, MEDIA_IOC_FL_GRAPH_MUTEX), MEDIA_IOC(ENUM_ENTITIES, media_device_enum_entities, MEDIA_IOC_FL_GRAPH_MUTEX), MEDIA_IOC(ENUM_LINKS, media_device_enum_links, MEDIA_IOC_FL_GRAPH_MUTEX), MEDIA_IOC(SETUP_LINK, media_device_setup_link, MEDIA_IOC_FL_GRAPH_MUTEX), MEDIA_IOC(G_TOPOLOGY, media_device_get_topology, MEDIA_IOC_FL_GRAPH_MUTEX), MEDIA_IOC(REQUEST_ALLOC, media_device_request_alloc, 0), }; static long media_device_ioctl(struct file *filp, unsigned int cmd, unsigned long __arg) { struct media_devnode *devnode = media_devnode_data(filp); struct media_device *dev = devnode->media_dev; const struct media_ioctl_info *info; void __user *arg = (void __user *)__arg; char __karg[256], *karg = __karg; long ret; if (_IOC_NR(cmd) >= ARRAY_SIZE(ioctl_info) || ioctl_info[_IOC_NR(cmd)].cmd != cmd) return -ENOIOCTLCMD; info = &ioctl_info[_IOC_NR(cmd)]; if (_IOC_SIZE(info->cmd) > sizeof(__karg)) { karg = kmalloc(_IOC_SIZE(info->cmd), GFP_KERNEL); if (!karg) return -ENOMEM; } if (info->arg_from_user) { ret = info->arg_from_user(karg, arg, cmd); if (ret) goto out_free; } if (info->flags & MEDIA_IOC_FL_GRAPH_MUTEX) mutex_lock(&dev->graph_mutex); ret = info->fn(dev, karg); if (info->flags & MEDIA_IOC_FL_GRAPH_MUTEX) mutex_unlock(&dev->graph_mutex); if (!ret && info->arg_to_user) ret = info->arg_to_user(arg, karg, cmd); out_free: if (karg != __karg) kfree(karg); return ret; } #ifdef CONFIG_COMPAT struct media_links_enum32 { __u32 entity; compat_uptr_t pads; /* struct media_pad_desc * */ compat_uptr_t links; /* struct media_link_desc * */ __u32 reserved[4]; }; static long media_device_enum_links32(struct media_device *mdev, struct media_links_enum32 __user *ulinks) { struct media_links_enum links; compat_uptr_t pads_ptr, links_ptr; int ret; memset(&links, 0, sizeof(links)); if (get_user(links.entity, &ulinks->entity) || get_user(pads_ptr, &ulinks->pads) || get_user(links_ptr, &ulinks->links)) return -EFAULT; links.pads = compat_ptr(pads_ptr); links.links = compat_ptr(links_ptr); ret = media_device_enum_links(mdev, &links); if (ret) return ret; if (copy_to_user(ulinks->reserved, links.reserved, sizeof(ulinks->reserved))) return -EFAULT; return 0; } #define MEDIA_IOC_ENUM_LINKS32 _IOWR('|', 0x02, struct media_links_enum32) static long media_device_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct media_devnode *devnode = media_devnode_data(filp); struct media_device *dev = devnode->media_dev; long ret; switch (cmd) { case MEDIA_IOC_ENUM_LINKS32: mutex_lock(&dev->graph_mutex); ret = media_device_enum_links32(dev, (struct media_links_enum32 __user *)arg); mutex_unlock(&dev->graph_mutex); break; default: return media_device_ioctl(filp, cmd, arg); } return ret; } #endif /* CONFIG_COMPAT */ static const struct media_file_operations media_device_fops = { .owner = THIS_MODULE, .open = media_device_open, .ioctl = media_device_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = media_device_compat_ioctl, #endif /* CONFIG_COMPAT */ .release = media_device_close, }; /* ----------------------------------------------------------------------------- * sysfs */ static ssize_t model_show(struct device *cd, struct device_attribute *attr, char *buf) { struct media_devnode *devnode = to_media_devnode(cd); struct media_device *mdev = devnode->media_dev; return sprintf(buf, "%.*s\n", (int)sizeof(mdev->model), mdev->model); } static DEVICE_ATTR_RO(model); /* ----------------------------------------------------------------------------- * Registration/unregistration */ static void media_device_release(struct media_devnode *devnode) { dev_dbg(devnode->parent, "Media device released\n"); } static void __media_device_unregister_entity(struct media_entity *entity) { struct media_device *mdev = entity->graph_obj.mdev; struct media_link *link, *tmp; struct media_interface *intf; struct media_pad *iter; ida_free(&mdev->entity_internal_idx, entity->internal_idx); /* Remove all interface links pointing to this entity */ list_for_each_entry(intf, &mdev->interfaces, graph_obj.list) { list_for_each_entry_safe(link, tmp, &intf->links, list) { if (link->entity == entity) __media_remove_intf_link(link); } } /* Remove all data links that belong to this entity */ __media_entity_remove_links(entity); /* Remove all pads that belong to this entity */ media_entity_for_each_pad(entity, iter) media_gobj_destroy(&iter->graph_obj); /* Remove the entity */ media_gobj_destroy(&entity->graph_obj); /* invoke entity_notify callbacks to handle entity removal?? */ } int __must_check media_device_register_entity(struct media_device *mdev, struct media_entity *entity) { struct media_entity_notify *notify, *next; struct media_pad *iter; int ret; if (entity->function == MEDIA_ENT_F_V4L2_SUBDEV_UNKNOWN || entity->function == MEDIA_ENT_F_UNKNOWN) dev_warn(mdev->dev, "Entity type for entity %s was not initialized!\n", entity->name); /* Warn if we apparently re-register an entity */ WARN_ON(entity->graph_obj.mdev != NULL); entity->graph_obj.mdev = mdev; INIT_LIST_HEAD(&entity->links); entity->num_links = 0; entity->num_backlinks = 0; ret = ida_alloc_min(&mdev->entity_internal_idx, 1, GFP_KERNEL); if (ret < 0) return ret; entity->internal_idx = ret; mutex_lock(&mdev->graph_mutex); mdev->entity_internal_idx_max = max(mdev->entity_internal_idx_max, entity->internal_idx); /* Initialize media_gobj embedded at the entity */ media_gobj_create(mdev, MEDIA_GRAPH_ENTITY, &entity->graph_obj); /* Initialize objects at the pads */ media_entity_for_each_pad(entity, iter) media_gobj_create(mdev, MEDIA_GRAPH_PAD, &iter->graph_obj); /* invoke entity_notify callbacks */ list_for_each_entry_safe(notify, next, &mdev->entity_notify, list) notify->notify(entity, notify->notify_data); if (mdev->entity_internal_idx_max >= mdev->pm_count_walk.ent_enum.idx_max) { struct media_graph new = { .top = 0 }; /* * Initialise the new graph walk before cleaning up * the old one in order not to spoil the graph walk * object of the media device if graph walk init fails. */ ret = media_graph_walk_init(&new, mdev); if (ret) { __media_device_unregister_entity(entity); mutex_unlock(&mdev->graph_mutex); return ret; } media_graph_walk_cleanup(&mdev->pm_count_walk); mdev->pm_count_walk = new; } mutex_unlock(&mdev->graph_mutex); return 0; } EXPORT_SYMBOL_GPL(media_device_register_entity); void media_device_unregister_entity(struct media_entity *entity) { struct media_device *mdev = entity->graph_obj.mdev; if (mdev == NULL) return; mutex_lock(&mdev->graph_mutex); __media_device_unregister_entity(entity); mutex_unlock(&mdev->graph_mutex); } EXPORT_SYMBOL_GPL(media_device_unregister_entity); void media_device_init(struct media_device *mdev) { INIT_LIST_HEAD(&mdev->entities); INIT_LIST_HEAD(&mdev->interfaces); INIT_LIST_HEAD(&mdev->pads); INIT_LIST_HEAD(&mdev->links); INIT_LIST_HEAD(&mdev->entity_notify); mutex_init(&mdev->req_queue_mutex); mutex_init(&mdev->graph_mutex); ida_init(&mdev->entity_internal_idx); atomic_set(&mdev->request_id, 0); if (!*mdev->bus_info) media_set_bus_info(mdev->bus_info, sizeof(mdev->bus_info), mdev->dev); dev_dbg(mdev->dev, "Media device initialized\n"); } EXPORT_SYMBOL_GPL(media_device_init); void media_device_cleanup(struct media_device *mdev) { ida_destroy(&mdev->entity_internal_idx); mdev->entity_internal_idx_max = 0; media_graph_walk_cleanup(&mdev->pm_count_walk); mutex_destroy(&mdev->graph_mutex); mutex_destroy(&mdev->req_queue_mutex); } EXPORT_SYMBOL_GPL(media_device_cleanup); int __must_check __media_device_register(struct media_device *mdev, struct module *owner) { struct media_devnode *devnode; int ret; devnode = kzalloc(sizeof(*devnode), GFP_KERNEL); if (!devnode) return -ENOMEM; /* Register the device node. */ mdev->devnode = devnode; devnode->fops = &media_device_fops; devnode->parent = mdev->dev; devnode->release = media_device_release; /* Set version 0 to indicate user-space that the graph is static */ mdev->topology_version = 0; ret = media_devnode_register(mdev, devnode, owner); if (ret < 0) { /* devnode free is handled in media_devnode_*() */ mdev->devnode = NULL; return ret; } ret = device_create_file(&devnode->dev, &dev_attr_model); if (ret < 0) { /* devnode free is handled in media_devnode_*() */ mdev->devnode = NULL; media_devnode_unregister_prepare(devnode); media_devnode_unregister(devnode); return ret; } dev_dbg(mdev->dev, "Media device registered\n"); return 0; } EXPORT_SYMBOL_GPL(__media_device_register); void media_device_register_entity_notify(struct media_device *mdev, struct media_entity_notify *nptr) { mutex_lock(&mdev->graph_mutex); list_add_tail(&nptr->list, &mdev->entity_notify); mutex_unlock(&mdev->graph_mutex); } EXPORT_SYMBOL_GPL(media_device_register_entity_notify); /* * Note: Should be called with mdev->lock held. */ static void __media_device_unregister_entity_notify(struct media_device *mdev, struct media_entity_notify *nptr) { list_del(&nptr->list); } void media_device_unregister_entity_notify(struct media_device *mdev, struct media_entity_notify *nptr) { mutex_lock(&mdev->graph_mutex); __media_device_unregister_entity_notify(mdev, nptr); mutex_unlock(&mdev->graph_mutex); } EXPORT_SYMBOL_GPL(media_device_unregister_entity_notify); void media_device_unregister(struct media_device *mdev) { struct media_entity *entity; struct media_entity *next; struct media_interface *intf, *tmp_intf; struct media_entity_notify *notify, *nextp; if (mdev == NULL) return; mutex_lock(&mdev->graph_mutex); /* Check if mdev was ever registered at all */ if (!media_devnode_is_registered(mdev->devnode)) { mutex_unlock(&mdev->graph_mutex); return; } /* Clear the devnode register bit to avoid races with media dev open */ media_devnode_unregister_prepare(mdev->devnode); /* Remove all entities from the media device */ list_for_each_entry_safe(entity, next, &mdev->entities, graph_obj.list) __media_device_unregister_entity(entity); /* Remove all entity_notify callbacks from the media device */ list_for_each_entry_safe(notify, nextp, &mdev->entity_notify, list) __media_device_unregister_entity_notify(mdev, notify); /* Remove all interfaces from the media device */ list_for_each_entry_safe(intf, tmp_intf, &mdev->interfaces, graph_obj.list) { /* * Unlink the interface, but don't free it here; the * module which created it is responsible for freeing * it */ __media_remove_intf_links(intf); media_gobj_destroy(&intf->graph_obj); } mutex_unlock(&mdev->graph_mutex); dev_dbg(mdev->dev, "Media device unregistered\n"); device_remove_file(&mdev->devnode->dev, &dev_attr_model); media_devnode_unregister(mdev->devnode); /* devnode free is handled in media_devnode_*() */ mdev->devnode = NULL; } EXPORT_SYMBOL_GPL(media_device_unregister); #if IS_ENABLED(CONFIG_PCI) void media_device_pci_init(struct media_device *mdev, struct pci_dev *pci_dev, const char *name) { mdev->dev = &pci_dev->dev; if (name) strscpy(mdev->model, name, sizeof(mdev->model)); else strscpy(mdev->model, pci_name(pci_dev), sizeof(mdev->model)); sprintf(mdev->bus_info, "PCI:%s", pci_name(pci_dev)); mdev->hw_revision = (pci_dev->subsystem_vendor << 16) | pci_dev->subsystem_device; media_device_init(mdev); } EXPORT_SYMBOL_GPL(media_device_pci_init); #endif #if IS_ENABLED(CONFIG_USB) void __media_device_usb_init(struct media_device *mdev, struct usb_device *udev, const char *board_name, const char *driver_name) { mdev->dev = &udev->dev; if (driver_name) strscpy(mdev->driver_name, driver_name, sizeof(mdev->driver_name)); if (board_name) strscpy(mdev->model, board_name, sizeof(mdev->model)); else if (udev->product) strscpy(mdev->model, udev->product, sizeof(mdev->model)); else strscpy(mdev->model, "unknown model", sizeof(mdev->model)); if (udev->serial) strscpy(mdev->serial, udev->serial, sizeof(mdev->serial)); usb_make_path(udev, mdev->bus_info, sizeof(mdev->bus_info)); mdev->hw_revision = le16_to_cpu(udev->descriptor.bcdDevice); media_device_init(mdev); } EXPORT_SYMBOL_GPL(__media_device_usb_init); #endif #endif /* CONFIG_MEDIA_CONTROLLER */
7 7 5 1 2 7 2 3 2 2 2 2 2 2 1 1 3 3 2 1 1 1 1 2 4 4 4 17 2 2 1 1 2 1 4 2 1 1 1 1 2 5 3 1 1 2 4 2 2 1 1 2 4 2 1 2 2 1 5 5 2 3 1 5 35 1 2 1 5 1 4 5 15 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2009 Red Hat, Inc. * Author: Michael S. Tsirkin <mst@redhat.com> * * virtio-net server in host kernel. */ #include <linux/compat.h> #include <linux/eventfd.h> #include <linux/vhost.h> #include <linux/virtio_net.h> #include <linux/miscdevice.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/mutex.h> #include <linux/workqueue.h> #include <linux/file.h> #include <linux/slab.h> #include <linux/sched/clock.h> #include <linux/sched/signal.h> #include <linux/vmalloc.h> #include <linux/net.h> #include <linux/if_packet.h> #include <linux/if_arp.h> #include <linux/if_tun.h> #include <linux/if_macvlan.h> #include <linux/if_tap.h> #include <linux/if_vlan.h> #include <linux/skb_array.h> #include <linux/skbuff.h> #include <net/sock.h> #include <net/xdp.h> #include "vhost.h" static int experimental_zcopytx = 0; module_param(experimental_zcopytx, int, 0444); MODULE_PARM_DESC(experimental_zcopytx, "Enable Zero Copy TX;" " 1 -Enable; 0 - Disable"); /* Max number of bytes transferred before requeueing the job. * Using this limit prevents one virtqueue from starving others. */ #define VHOST_NET_WEIGHT 0x80000 /* Max number of packets transferred before requeueing the job. * Using this limit prevents one virtqueue from starving others with small * pkts. */ #define VHOST_NET_PKT_WEIGHT 256 /* MAX number of TX used buffers for outstanding zerocopy */ #define VHOST_MAX_PEND 128 #define VHOST_GOODCOPY_LEN 256 /* * For transmit, used buffer len is unused; we override it to track buffer * status internally; used for zerocopy tx only. */ /* Lower device DMA failed */ #define VHOST_DMA_FAILED_LEN ((__force __virtio32)3) /* Lower device DMA done */ #define VHOST_DMA_DONE_LEN ((__force __virtio32)2) /* Lower device DMA in progress */ #define VHOST_DMA_IN_PROGRESS ((__force __virtio32)1) /* Buffer unused */ #define VHOST_DMA_CLEAR_LEN ((__force __virtio32)0) #define VHOST_DMA_IS_DONE(len) ((__force u32)(len) >= (__force u32)VHOST_DMA_DONE_LEN) enum { VHOST_NET_FEATURES = VHOST_FEATURES | (1ULL << VHOST_NET_F_VIRTIO_NET_HDR) | (1ULL << VIRTIO_NET_F_MRG_RXBUF) | (1ULL << VIRTIO_F_ACCESS_PLATFORM) | (1ULL << VIRTIO_F_RING_RESET) }; enum { VHOST_NET_BACKEND_FEATURES = (1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2) }; enum { VHOST_NET_VQ_RX = 0, VHOST_NET_VQ_TX = 1, VHOST_NET_VQ_MAX = 2, }; struct vhost_net_ubuf_ref { /* refcount follows semantics similar to kref: * 0: object is released * 1: no outstanding ubufs * >1: outstanding ubufs */ atomic_t refcount; wait_queue_head_t wait; struct vhost_virtqueue *vq; }; #define VHOST_NET_BATCH 64 struct vhost_net_buf { void **queue; int tail; int head; }; struct vhost_net_virtqueue { struct vhost_virtqueue vq; size_t vhost_hlen; size_t sock_hlen; /* vhost zerocopy support fields below: */ /* last used idx for outstanding DMA zerocopy buffers */ int upend_idx; /* For TX, first used idx for DMA done zerocopy buffers * For RX, number of batched heads */ int done_idx; /* Number of XDP frames batched */ int batched_xdp; /* an array of userspace buffers info */ struct ubuf_info_msgzc *ubuf_info; /* Reference counting for outstanding ubufs. * Protected by vq mutex. Writers must also take device mutex. */ struct vhost_net_ubuf_ref *ubufs; struct ptr_ring *rx_ring; struct vhost_net_buf rxq; /* Batched XDP buffs */ struct xdp_buff *xdp; }; struct vhost_net { struct vhost_dev dev; struct vhost_net_virtqueue vqs[VHOST_NET_VQ_MAX]; struct vhost_poll poll[VHOST_NET_VQ_MAX]; /* Number of TX recently submitted. * Protected by tx vq lock. */ unsigned tx_packets; /* Number of times zerocopy TX recently failed. * Protected by tx vq lock. */ unsigned tx_zcopy_err; /* Flush in progress. Protected by tx vq lock. */ bool tx_flush; /* Private page frag */ struct page_frag page_frag; /* Refcount bias of page frag */ int refcnt_bias; }; static unsigned vhost_net_zcopy_mask __read_mostly; static void *vhost_net_buf_get_ptr(struct vhost_net_buf *rxq) { if (rxq->tail != rxq->head) return rxq->queue[rxq->head]; else return NULL; } static int vhost_net_buf_get_size(struct vhost_net_buf *rxq) { return rxq->tail - rxq->head; } static int vhost_net_buf_is_empty(struct vhost_net_buf *rxq) { return rxq->tail == rxq->head; } static void *vhost_net_buf_consume(struct vhost_net_buf *rxq) { void *ret = vhost_net_buf_get_ptr(rxq); ++rxq->head; return ret; } static int vhost_net_buf_produce(struct vhost_net_virtqueue *nvq) { struct vhost_net_buf *rxq = &nvq->rxq; rxq->head = 0; rxq->tail = ptr_ring_consume_batched(nvq->rx_ring, rxq->queue, VHOST_NET_BATCH); return rxq->tail; } static void vhost_net_buf_unproduce(struct vhost_net_virtqueue *nvq) { struct vhost_net_buf *rxq = &nvq->rxq; if (nvq->rx_ring && !vhost_net_buf_is_empty(rxq)) { ptr_ring_unconsume(nvq->rx_ring, rxq->queue + rxq->head, vhost_net_buf_get_size(rxq), tun_ptr_free); rxq->head = rxq->tail = 0; } } static int vhost_net_buf_peek_len(void *ptr) { if (tun_is_xdp_frame(ptr)) { struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr); return xdpf->len; } return __skb_array_len_with_tag(ptr); } static int vhost_net_buf_peek(struct vhost_net_virtqueue *nvq) { struct vhost_net_buf *rxq = &nvq->rxq; if (!vhost_net_buf_is_empty(rxq)) goto out; if (!vhost_net_buf_produce(nvq)) return 0; out: return vhost_net_buf_peek_len(vhost_net_buf_get_ptr(rxq)); } static void vhost_net_buf_init(struct vhost_net_buf *rxq) { rxq->head = rxq->tail = 0; } static void vhost_net_enable_zcopy(int vq) { vhost_net_zcopy_mask |= 0x1 << vq; } static struct vhost_net_ubuf_ref * vhost_net_ubuf_alloc(struct vhost_virtqueue *vq, bool zcopy) { struct vhost_net_ubuf_ref *ubufs; /* No zero copy backend? Nothing to count. */ if (!zcopy) return NULL; ubufs = kmalloc(sizeof(*ubufs), GFP_KERNEL); if (!ubufs) return ERR_PTR(-ENOMEM); atomic_set(&ubufs->refcount, 1); init_waitqueue_head(&ubufs->wait); ubufs->vq = vq; return ubufs; } static int vhost_net_ubuf_put(struct vhost_net_ubuf_ref *ubufs) { int r = atomic_sub_return(1, &ubufs->refcount); if (unlikely(!r)) wake_up(&ubufs->wait); return r; } static void vhost_net_ubuf_put_and_wait(struct vhost_net_ubuf_ref *ubufs) { vhost_net_ubuf_put(ubufs); wait_event(ubufs->wait, !atomic_read(&ubufs->refcount)); } static void vhost_net_ubuf_put_wait_and_free(struct vhost_net_ubuf_ref *ubufs) { vhost_net_ubuf_put_and_wait(ubufs); kfree(ubufs); } static void vhost_net_clear_ubuf_info(struct vhost_net *n) { int i; for (i = 0; i < VHOST_NET_VQ_MAX; ++i) { kfree(n->vqs[i].ubuf_info); n->vqs[i].ubuf_info = NULL; } } static int vhost_net_set_ubuf_info(struct vhost_net *n) { bool zcopy; int i; for (i = 0; i < VHOST_NET_VQ_MAX; ++i) { zcopy = vhost_net_zcopy_mask & (0x1 << i); if (!zcopy) continue; n->vqs[i].ubuf_info = kmalloc_array(UIO_MAXIOV, sizeof(*n->vqs[i].ubuf_info), GFP_KERNEL); if (!n->vqs[i].ubuf_info) goto err; } return 0; err: vhost_net_clear_ubuf_info(n); return -ENOMEM; } static void vhost_net_vq_reset(struct vhost_net *n) { int i; vhost_net_clear_ubuf_info(n); for (i = 0; i < VHOST_NET_VQ_MAX; i++) { n->vqs[i].done_idx = 0; n->vqs[i].upend_idx = 0; n->vqs[i].ubufs = NULL; n->vqs[i].vhost_hlen = 0; n->vqs[i].sock_hlen = 0; vhost_net_buf_init(&n->vqs[i].rxq); } } static void vhost_net_tx_packet(struct vhost_net *net) { ++net->tx_packets; if (net->tx_packets < 1024) return; net->tx_packets = 0; net->tx_zcopy_err = 0; } static void vhost_net_tx_err(struct vhost_net *net) { ++net->tx_zcopy_err; } static bool vhost_net_tx_select_zcopy(struct vhost_net *net) { /* TX flush waits for outstanding DMAs to be done. * Don't start new DMAs. */ return !net->tx_flush && net->tx_packets / 64 >= net->tx_zcopy_err; } static bool vhost_sock_zcopy(struct socket *sock) { return unlikely(experimental_zcopytx) && sock_flag(sock->sk, SOCK_ZEROCOPY); } static bool vhost_sock_xdp(struct socket *sock) { return sock_flag(sock->sk, SOCK_XDP); } /* In case of DMA done not in order in lower device driver for some reason. * upend_idx is used to track end of used idx, done_idx is used to track head * of used idx. Once lower device DMA done contiguously, we will signal KVM * guest used idx. */ static void vhost_zerocopy_signal_used(struct vhost_net *net, struct vhost_virtqueue *vq) { struct vhost_net_virtqueue *nvq = container_of(vq, struct vhost_net_virtqueue, vq); int i, add; int j = 0; for (i = nvq->done_idx; i != nvq->upend_idx; i = (i + 1) % UIO_MAXIOV) { if (vq->heads[i].len == VHOST_DMA_FAILED_LEN) vhost_net_tx_err(net); if (VHOST_DMA_IS_DONE(vq->heads[i].len)) { vq->heads[i].len = VHOST_DMA_CLEAR_LEN; ++j; } else break; } while (j) { add = min(UIO_MAXIOV - nvq->done_idx, j); vhost_add_used_and_signal_n(vq->dev, vq, &vq->heads[nvq->done_idx], add); nvq->done_idx = (nvq->done_idx + add) % UIO_MAXIOV; j -= add; } } static void vhost_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *ubuf_base, bool success) { struct ubuf_info_msgzc *ubuf = uarg_to_msgzc(ubuf_base); struct vhost_net_ubuf_ref *ubufs = ubuf->ctx; struct vhost_virtqueue *vq = ubufs->vq; int cnt; rcu_read_lock_bh(); /* set len to mark this desc buffers done DMA */ vq->heads[ubuf->desc].len = success ? VHOST_DMA_DONE_LEN : VHOST_DMA_FAILED_LEN; cnt = vhost_net_ubuf_put(ubufs); /* * Trigger polling thread if guest stopped submitting new buffers: * in this case, the refcount after decrement will eventually reach 1. * We also trigger polling periodically after each 16 packets * (the value 16 here is more or less arbitrary, it's tuned to trigger * less than 10% of times). */ if (cnt <= 1 || !(cnt % 16)) vhost_poll_queue(&vq->poll); rcu_read_unlock_bh(); } static inline unsigned long busy_clock(void) { return local_clock() >> 10; } static bool vhost_can_busy_poll(unsigned long endtime) { return likely(!need_resched() && !time_after(busy_clock(), endtime) && !signal_pending(current)); } static void vhost_net_disable_vq(struct vhost_net *n, struct vhost_virtqueue *vq) { struct vhost_net_virtqueue *nvq = container_of(vq, struct vhost_net_virtqueue, vq); struct vhost_poll *poll = n->poll + (nvq - n->vqs); if (!vhost_vq_get_backend(vq)) return; vhost_poll_stop(poll); } static int vhost_net_enable_vq(struct vhost_net *n, struct vhost_virtqueue *vq) { struct vhost_net_virtqueue *nvq = container_of(vq, struct vhost_net_virtqueue, vq); struct vhost_poll *poll = n->poll + (nvq - n->vqs); struct socket *sock; sock = vhost_vq_get_backend(vq); if (!sock) return 0; return vhost_poll_start(poll, sock->file); } static void vhost_net_signal_used(struct vhost_net_virtqueue *nvq) { struct vhost_virtqueue *vq = &nvq->vq; struct vhost_dev *dev = vq->dev; if (!nvq->done_idx) return; vhost_add_used_and_signal_n(dev, vq, vq->heads, nvq->done_idx); nvq->done_idx = 0; } static void vhost_tx_batch(struct vhost_net *net, struct vhost_net_virtqueue *nvq, struct socket *sock, struct msghdr *msghdr) { struct tun_msg_ctl ctl = { .type = TUN_MSG_PTR, .num = nvq->batched_xdp, .ptr = nvq->xdp, }; int i, err; if (nvq->batched_xdp == 0) goto signal_used; msghdr->msg_control = &ctl; msghdr->msg_controllen = sizeof(ctl); err = sock->ops->sendmsg(sock, msghdr, 0); if (unlikely(err < 0)) { vq_err(&nvq->vq, "Fail to batch sending packets\n"); /* free pages owned by XDP; since this is an unlikely error path, * keep it simple and avoid more complex bulk update for the * used pages */ for (i = 0; i < nvq->batched_xdp; ++i) put_page(virt_to_head_page(nvq->xdp[i].data)); nvq->batched_xdp = 0; nvq->done_idx = 0; return; } signal_used: vhost_net_signal_used(nvq); nvq->batched_xdp = 0; } static int sock_has_rx_data(struct socket *sock) { if (unlikely(!sock)) return 0; if (sock->ops->peek_len) return sock->ops->peek_len(sock); return skb_queue_empty(&sock->sk->sk_receive_queue); } static void vhost_net_busy_poll_try_queue(struct vhost_net *net, struct vhost_virtqueue *vq) { if (!vhost_vq_avail_empty(&net->dev, vq)) { vhost_poll_queue(&vq->poll); } else if (unlikely(vhost_enable_notify(&net->dev, vq))) { vhost_disable_notify(&net->dev, vq); vhost_poll_queue(&vq->poll); } } static void vhost_net_busy_poll(struct vhost_net *net, struct vhost_virtqueue *rvq, struct vhost_virtqueue *tvq, bool *busyloop_intr, bool poll_rx) { unsigned long busyloop_timeout; unsigned long endtime; struct socket *sock; struct vhost_virtqueue *vq = poll_rx ? tvq : rvq; /* Try to hold the vq mutex of the paired virtqueue. We can't * use mutex_lock() here since we could not guarantee a * consistenet lock ordering. */ if (!mutex_trylock(&vq->mutex)) return; vhost_disable_notify(&net->dev, vq); sock = vhost_vq_get_backend(rvq); busyloop_timeout = poll_rx ? rvq->busyloop_timeout: tvq->busyloop_timeout; preempt_disable(); endtime = busy_clock() + busyloop_timeout; while (vhost_can_busy_poll(endtime)) { if (vhost_vq_has_work(vq)) { *busyloop_intr = true; break; } if ((sock_has_rx_data(sock) && !vhost_vq_avail_empty(&net->dev, rvq)) || !vhost_vq_avail_empty(&net->dev, tvq)) break; cpu_relax(); } preempt_enable(); if (poll_rx || sock_has_rx_data(sock)) vhost_net_busy_poll_try_queue(net, vq); else if (!poll_rx) /* On tx here, sock has no rx data. */ vhost_enable_notify(&net->dev, rvq); mutex_unlock(&vq->mutex); } static int vhost_net_tx_get_vq_desc(struct vhost_net *net, struct vhost_net_virtqueue *tnvq, unsigned int *out_num, unsigned int *in_num, struct msghdr *msghdr, bool *busyloop_intr) { struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX]; struct vhost_virtqueue *rvq = &rnvq->vq; struct vhost_virtqueue *tvq = &tnvq->vq; int r = vhost_get_vq_desc(tvq, tvq->iov, ARRAY_SIZE(tvq->iov), out_num, in_num, NULL, NULL); if (r == tvq->num && tvq->busyloop_timeout) { /* Flush batched packets first */ if (!vhost_sock_zcopy(vhost_vq_get_backend(tvq))) vhost_tx_batch(net, tnvq, vhost_vq_get_backend(tvq), msghdr); vhost_net_busy_poll(net, rvq, tvq, busyloop_intr, false); r = vhost_get_vq_desc(tvq, tvq->iov, ARRAY_SIZE(tvq->iov), out_num, in_num, NULL, NULL); } return r; } static bool vhost_exceeds_maxpend(struct vhost_net *net) { struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX]; struct vhost_virtqueue *vq = &nvq->vq; return (nvq->upend_idx + UIO_MAXIOV - nvq->done_idx) % UIO_MAXIOV > min_t(unsigned int, VHOST_MAX_PEND, vq->num >> 2); } static size_t init_iov_iter(struct vhost_virtqueue *vq, struct iov_iter *iter, size_t hdr_size, int out) { /* Skip header. TODO: support TSO. */ size_t len = iov_length(vq->iov, out); iov_iter_init(iter, ITER_SOURCE, vq->iov, out, len); iov_iter_advance(iter, hdr_size); return iov_iter_count(iter); } static int get_tx_bufs(struct vhost_net *net, struct vhost_net_virtqueue *nvq, struct msghdr *msg, unsigned int *out, unsigned int *in, size_t *len, bool *busyloop_intr) { struct vhost_virtqueue *vq = &nvq->vq; int ret; ret = vhost_net_tx_get_vq_desc(net, nvq, out, in, msg, busyloop_intr); if (ret < 0 || ret == vq->num) return ret; if (*in) { vq_err(vq, "Unexpected descriptor format for TX: out %d, int %d\n", *out, *in); return -EFAULT; } /* Sanity check */ *len = init_iov_iter(vq, &msg->msg_iter, nvq->vhost_hlen, *out); if (*len == 0) { vq_err(vq, "Unexpected header len for TX: %zd expected %zd\n", *len, nvq->vhost_hlen); return -EFAULT; } return ret; } static bool tx_can_batch(struct vhost_virtqueue *vq, size_t total_len) { return total_len < VHOST_NET_WEIGHT && !vhost_vq_avail_empty(vq->dev, vq); } static bool vhost_net_page_frag_refill(struct vhost_net *net, unsigned int sz, struct page_frag *pfrag, gfp_t gfp) { if (pfrag->page) { if (pfrag->offset + sz <= pfrag->size) return true; __page_frag_cache_drain(pfrag->page, net->refcnt_bias); } pfrag->offset = 0; net->refcnt_bias = 0; if (SKB_FRAG_PAGE_ORDER) { /* Avoid direct reclaim but allow kswapd to wake */ pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) | __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY, SKB_FRAG_PAGE_ORDER); if (likely(pfrag->page)) { pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER; goto done; } } pfrag->page = alloc_page(gfp); if (likely(pfrag->page)) { pfrag->size = PAGE_SIZE; goto done; } return false; done: net->refcnt_bias = USHRT_MAX; page_ref_add(pfrag->page, USHRT_MAX - 1); return true; } #define VHOST_NET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD) static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq, struct iov_iter *from) { struct vhost_virtqueue *vq = &nvq->vq; struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev); struct socket *sock = vhost_vq_get_backend(vq); struct page_frag *alloc_frag = &net->page_frag; struct virtio_net_hdr *gso; struct xdp_buff *xdp = &nvq->xdp[nvq->batched_xdp]; struct tun_xdp_hdr *hdr; size_t len = iov_iter_count(from); int headroom = vhost_sock_xdp(sock) ? XDP_PACKET_HEADROOM : 0; int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); int pad = SKB_DATA_ALIGN(VHOST_NET_RX_PAD + headroom + nvq->sock_hlen); int sock_hlen = nvq->sock_hlen; void *buf; int copied; if (unlikely(len < nvq->sock_hlen)) return -EFAULT; if (SKB_DATA_ALIGN(len + pad) + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) > PAGE_SIZE) return -ENOSPC; buflen += SKB_DATA_ALIGN(len + pad); alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES); if (unlikely(!vhost_net_page_frag_refill(net, buflen, alloc_frag, GFP_KERNEL))) return -ENOMEM; buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; copied = copy_page_from_iter(alloc_frag->page, alloc_frag->offset + offsetof(struct tun_xdp_hdr, gso), sock_hlen, from); if (copied != sock_hlen) return -EFAULT; hdr = buf; gso = &hdr->gso; if ((gso->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && vhost16_to_cpu(vq, gso->csum_start) + vhost16_to_cpu(vq, gso->csum_offset) + 2 > vhost16_to_cpu(vq, gso->hdr_len)) { gso->hdr_len = cpu_to_vhost16(vq, vhost16_to_cpu(vq, gso->csum_start) + vhost16_to_cpu(vq, gso->csum_offset) + 2); if (vhost16_to_cpu(vq, gso->hdr_len) > len) return -EINVAL; } len -= sock_hlen; copied = copy_page_from_iter(alloc_frag->page, alloc_frag->offset + pad, len, from); if (copied != len) return -EFAULT; xdp_init_buff(xdp, buflen, NULL); xdp_prepare_buff(xdp, buf, pad, len, true); hdr->buflen = buflen; --net->refcnt_bias; alloc_frag->offset += buflen; ++nvq->batched_xdp; return 0; } static void handle_tx_copy(struct vhost_net *net, struct socket *sock) { struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX]; struct vhost_virtqueue *vq = &nvq->vq; unsigned out, in; int head; struct msghdr msg = { .msg_name = NULL, .msg_namelen = 0, .msg_control = NULL, .msg_controllen = 0, .msg_flags = MSG_DONTWAIT, }; size_t len, total_len = 0; int err; int sent_pkts = 0; bool sock_can_batch = (sock->sk->sk_sndbuf == INT_MAX); do { bool busyloop_intr = false; if (nvq->done_idx == VHOST_NET_BATCH) vhost_tx_batch(net, nvq, sock, &msg); head = get_tx_bufs(net, nvq, &msg, &out, &in, &len, &busyloop_intr); /* On error, stop handling until the next kick. */ if (unlikely(head < 0)) break; /* Nothing new? Wait for eventfd to tell us they refilled. */ if (head == vq->num) { if (unlikely(busyloop_intr)) { vhost_poll_queue(&vq->poll); } else if (unlikely(vhost_enable_notify(&net->dev, vq))) { vhost_disable_notify(&net->dev, vq); continue; } break; } total_len += len; /* For simplicity, TX batching is only enabled if * sndbuf is unlimited. */ if (sock_can_batch) { err = vhost_net_build_xdp(nvq, &msg.msg_iter); if (!err) { goto done; } else if (unlikely(err != -ENOSPC)) { vhost_tx_batch(net, nvq, sock, &msg); vhost_discard_vq_desc(vq, 1); vhost_net_enable_vq(net, vq); break; } /* We can't build XDP buff, go for single * packet path but let's flush batched * packets. */ vhost_tx_batch(net, nvq, sock, &msg); msg.msg_control = NULL; } else { if (tx_can_batch(vq, total_len)) msg.msg_flags |= MSG_MORE; else msg.msg_flags &= ~MSG_MORE; } err = sock->ops->sendmsg(sock, &msg, len); if (unlikely(err < 0)) { if (err == -EAGAIN || err == -ENOMEM || err == -ENOBUFS) { vhost_discard_vq_desc(vq, 1); vhost_net_enable_vq(net, vq); break; } pr_debug("Fail to send packet: err %d", err); } else if (unlikely(err != len)) pr_debug("Truncated TX packet: len %d != %zd\n", err, len); done: vq->heads[nvq->done_idx].id = cpu_to_vhost32(vq, head); vq->heads[nvq->done_idx].len = 0; ++nvq->done_idx; } while (likely(!vhost_exceeds_weight(vq, ++sent_pkts, total_len))); vhost_tx_batch(net, nvq, sock, &msg); } static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock) { struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX]; struct vhost_virtqueue *vq = &nvq->vq; unsigned out, in; int head; struct msghdr msg = { .msg_name = NULL, .msg_namelen = 0, .msg_control = NULL, .msg_controllen = 0, .msg_flags = MSG_DONTWAIT, }; struct tun_msg_ctl ctl; size_t len, total_len = 0; int err; struct vhost_net_ubuf_ref *ubufs; struct ubuf_info_msgzc *ubuf; bool zcopy_used; int sent_pkts = 0; do { bool busyloop_intr; /* Release DMAs done buffers first */ vhost_zerocopy_signal_used(net, vq); busyloop_intr = false; head = get_tx_bufs(net, nvq, &msg, &out, &in, &len, &busyloop_intr); /* On error, stop handling until the next kick. */ if (unlikely(head < 0)) break; /* Nothing new? Wait for eventfd to tell us they refilled. */ if (head == vq->num) { if (unlikely(busyloop_intr)) { vhost_poll_queue(&vq->poll); } else if (unlikely(vhost_enable_notify(&net->dev, vq))) { vhost_disable_notify(&net->dev, vq); continue; } break; } zcopy_used = len >= VHOST_GOODCOPY_LEN && !vhost_exceeds_maxpend(net) && vhost_net_tx_select_zcopy(net); /* use msg_control to pass vhost zerocopy ubuf info to skb */ if (zcopy_used) { ubuf = nvq->ubuf_info + nvq->upend_idx; vq->heads[nvq->upend_idx].id = cpu_to_vhost32(vq, head); vq->heads[nvq->upend_idx].len = VHOST_DMA_IN_PROGRESS; ubuf->ctx = nvq->ubufs; ubuf->desc = nvq->upend_idx; ubuf->ubuf.callback = vhost_zerocopy_callback; ubuf->ubuf.flags = SKBFL_ZEROCOPY_FRAG; refcount_set(&ubuf->ubuf.refcnt, 1); msg.msg_control = &ctl; ctl.type = TUN_MSG_UBUF; ctl.ptr = &ubuf->ubuf; msg.msg_controllen = sizeof(ctl); ubufs = nvq->ubufs; atomic_inc(&ubufs->refcount); nvq->upend_idx = (nvq->upend_idx + 1) % UIO_MAXIOV; } else { msg.msg_control = NULL; ubufs = NULL; } total_len += len; if (tx_can_batch(vq, total_len) && likely(!vhost_exceeds_maxpend(net))) { msg.msg_flags |= MSG_MORE; } else { msg.msg_flags &= ~MSG_MORE; } err = sock->ops->sendmsg(sock, &msg, len); if (unlikely(err < 0)) { bool retry = err == -EAGAIN || err == -ENOMEM || err == -ENOBUFS; if (zcopy_used) { if (vq->heads[ubuf->desc].len == VHOST_DMA_IN_PROGRESS) vhost_net_ubuf_put(ubufs); if (retry) nvq->upend_idx = ((unsigned)nvq->upend_idx - 1) % UIO_MAXIOV; else vq->heads[ubuf->desc].len = VHOST_DMA_DONE_LEN; } if (retry) { vhost_discard_vq_desc(vq, 1); vhost_net_enable_vq(net, vq); break; } pr_debug("Fail to send packet: err %d", err); } else if (unlikely(err != len)) pr_debug("Truncated TX packet: " " len %d != %zd\n", err, len); if (!zcopy_used) vhost_add_used_and_signal(&net->dev, vq, head, 0); else vhost_zerocopy_signal_used(net, vq); vhost_net_tx_packet(net); } while (likely(!vhost_exceeds_weight(vq, ++sent_pkts, total_len))); } /* Expects to be always run from workqueue - which acts as * read-size critical section for our kind of RCU. */ static void handle_tx(struct vhost_net *net) { struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX]; struct vhost_virtqueue *vq = &nvq->vq; struct socket *sock; mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_TX); sock = vhost_vq_get_backend(vq); if (!sock) goto out; if (!vq_meta_prefetch(vq)) goto out; vhost_disable_notify(&net->dev, vq); vhost_net_disable_vq(net, vq); if (vhost_sock_zcopy(sock)) handle_tx_zerocopy(net, sock); else handle_tx_copy(net, sock); out: mutex_unlock(&vq->mutex); } static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk) { struct sk_buff *head; int len = 0; unsigned long flags; if (rvq->rx_ring) return vhost_net_buf_peek(rvq); spin_lock_irqsave(&sk->sk_receive_queue.lock, flags); head = skb_peek(&sk->sk_receive_queue); if (likely(head)) { len = head->len; if (skb_vlan_tag_present(head)) len += VLAN_HLEN; } spin_unlock_irqrestore(&sk->sk_receive_queue.lock, flags); return len; } static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk, bool *busyloop_intr) { struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX]; struct vhost_net_virtqueue *tnvq = &net->vqs[VHOST_NET_VQ_TX]; struct vhost_virtqueue *rvq = &rnvq->vq; struct vhost_virtqueue *tvq = &tnvq->vq; int len = peek_head_len(rnvq, sk); if (!len && rvq->busyloop_timeout) { /* Flush batched heads first */ vhost_net_signal_used(rnvq); /* Both tx vq and rx socket were polled here */ vhost_net_busy_poll(net, rvq, tvq, busyloop_intr, true); len = peek_head_len(rnvq, sk); } return len; } /* This is a multi-buffer version of vhost_get_desc, that works if * vq has read descriptors only. * @vq - the relevant virtqueue * @datalen - data length we'll be reading * @iovcount - returned count of io vectors we fill * @log - vhost log * @log_num - log offset * @quota - headcount quota, 1 for big buffer * returns number of buffer heads allocated, negative on error */ static int get_rx_bufs(struct vhost_virtqueue *vq, struct vring_used_elem *heads, int datalen, unsigned *iovcount, struct vhost_log *log, unsigned *log_num, unsigned int quota) { unsigned int out, in; int seg = 0; int headcount = 0; unsigned d; int r, nlogs = 0; /* len is always initialized before use since we are always called with * datalen > 0. */ u32 len; while (datalen > 0 && headcount < quota) { if (unlikely(seg >= UIO_MAXIOV)) { r = -ENOBUFS; goto err; } r = vhost_get_vq_desc(vq, vq->iov + seg, ARRAY_SIZE(vq->iov) - seg, &out, &in, log, log_num); if (unlikely(r < 0)) goto err; d = r; if (d == vq->num) { r = 0; goto err; } if (unlikely(out || in <= 0)) { vq_err(vq, "unexpected descriptor format for RX: " "out %d, in %d\n", out, in); r = -EINVAL; goto err; } if (unlikely(log)) { nlogs += *log_num; log += *log_num; } heads[headcount].id = cpu_to_vhost32(vq, d); len = iov_length(vq->iov + seg, in); heads[headcount].len = cpu_to_vhost32(vq, len); datalen -= len; ++headcount; seg += in; } heads[headcount - 1].len = cpu_to_vhost32(vq, len + datalen); *iovcount = seg; if (unlikely(log)) *log_num = nlogs; /* Detect overrun */ if (unlikely(datalen > 0)) { r = UIO_MAXIOV + 1; goto err; } return headcount; err: vhost_discard_vq_desc(vq, headcount); return r; } /* Expects to be always run from workqueue - which acts as * read-size critical section for our kind of RCU. */ static void handle_rx(struct vhost_net *net) { struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_RX]; struct vhost_virtqueue *vq = &nvq->vq; unsigned in, log; struct vhost_log *vq_log; struct msghdr msg = { .msg_name = NULL, .msg_namelen = 0, .msg_control = NULL, /* FIXME: get and handle RX aux data. */ .msg_controllen = 0, .msg_flags = MSG_DONTWAIT, }; struct virtio_net_hdr hdr = { .flags = 0, .gso_type = VIRTIO_NET_HDR_GSO_NONE }; size_t total_len = 0; int err, mergeable; s16 headcount; size_t vhost_hlen, sock_hlen; size_t vhost_len, sock_len; bool busyloop_intr = false; struct socket *sock; struct iov_iter fixup; __virtio16 num_buffers; int recv_pkts = 0; mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_RX); sock = vhost_vq_get_backend(vq); if (!sock) goto out; if (!vq_meta_prefetch(vq)) goto out; vhost_disable_notify(&net->dev, vq); vhost_net_disable_vq(net, vq); vhost_hlen = nvq->vhost_hlen; sock_hlen = nvq->sock_hlen; vq_log = unlikely(vhost_has_feature(vq, VHOST_F_LOG_ALL)) ? vq->log : NULL; mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF); do { sock_len = vhost_net_rx_peek_head_len(net, sock->sk, &busyloop_intr); if (!sock_len) break; sock_len += sock_hlen; vhost_len = sock_len + vhost_hlen; headcount = get_rx_bufs(vq, vq->heads + nvq->done_idx, vhost_len, &in, vq_log, &log, likely(mergeable) ? UIO_MAXIOV : 1); /* On error, stop handling until the next kick. */ if (unlikely(headcount < 0)) goto out; /* OK, now we need to know about added descriptors. */ if (!headcount) { if (unlikely(busyloop_intr)) { vhost_poll_queue(&vq->poll); } else if (unlikely(vhost_enable_notify(&net->dev, vq))) { /* They have slipped one in as we were * doing that: check again. */ vhost_disable_notify(&net->dev, vq); continue; } /* Nothing new? Wait for eventfd to tell us * they refilled. */ goto out; } busyloop_intr = false; if (nvq->rx_ring) msg.msg_control = vhost_net_buf_consume(&nvq->rxq); /* On overrun, truncate and discard */ if (unlikely(headcount > UIO_MAXIOV)) { iov_iter_init(&msg.msg_iter, ITER_DEST, vq->iov, 1, 1); err = sock->ops->recvmsg(sock, &msg, 1, MSG_DONTWAIT | MSG_TRUNC); pr_debug("Discarded rx packet: len %zd\n", sock_len); continue; } /* We don't need to be notified again. */ iov_iter_init(&msg.msg_iter, ITER_DEST, vq->iov, in, vhost_len); fixup = msg.msg_iter; if (unlikely((vhost_hlen))) { /* We will supply the header ourselves * TODO: support TSO. */ iov_iter_advance(&msg.msg_iter, vhost_hlen); } err = sock->ops->recvmsg(sock, &msg, sock_len, MSG_DONTWAIT | MSG_TRUNC); /* Userspace might have consumed the packet meanwhile: * it's not supposed to do this usually, but might be hard * to prevent. Discard data we got (if any) and keep going. */ if (unlikely(err != sock_len)) { pr_debug("Discarded rx packet: " " len %d, expected %zd\n", err, sock_len); vhost_discard_vq_desc(vq, headcount); continue; } /* Supply virtio_net_hdr if VHOST_NET_F_VIRTIO_NET_HDR */ if (unlikely(vhost_hlen)) { if (copy_to_iter(&hdr, sizeof(hdr), &fixup) != sizeof(hdr)) { vq_err(vq, "Unable to write vnet_hdr " "at addr %p\n", vq->iov->iov_base); goto out; } } else { /* Header came from socket; we'll need to patch * ->num_buffers over if VIRTIO_NET_F_MRG_RXBUF */ iov_iter_advance(&fixup, sizeof(hdr)); } /* TODO: Should check and handle checksum. */ num_buffers = cpu_to_vhost16(vq, headcount); if (likely(mergeable) && copy_to_iter(&num_buffers, sizeof num_buffers, &fixup) != sizeof num_buffers) { vq_err(vq, "Failed num_buffers write"); vhost_discard_vq_desc(vq, headcount); goto out; } nvq->done_idx += headcount; if (nvq->done_idx > VHOST_NET_BATCH) vhost_net_signal_used(nvq); if (unlikely(vq_log)) vhost_log_write(vq, vq_log, log, vhost_len, vq->iov, in); total_len += vhost_len; } while (likely(!vhost_exceeds_weight(vq, ++recv_pkts, total_len))); if (unlikely(busyloop_intr)) vhost_poll_queue(&vq->poll); else if (!sock_len) vhost_net_enable_vq(net, vq); out: vhost_net_signal_used(nvq); mutex_unlock(&vq->mutex); } static void handle_tx_kick(struct vhost_work *work) { struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue, poll.work); struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev); handle_tx(net); } static void handle_rx_kick(struct vhost_work *work) { struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue, poll.work); struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev); handle_rx(net); } static void handle_tx_net(struct vhost_work *work) { struct vhost_net *net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_TX].work); handle_tx(net); } static void handle_rx_net(struct vhost_work *work) { struct vhost_net *net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_RX].work); handle_rx(net); } static int vhost_net_open(struct inode *inode, struct file *f) { struct vhost_net *n; struct vhost_dev *dev; struct vhost_virtqueue **vqs; void **queue; struct xdp_buff *xdp; int i; n = kvmalloc(sizeof *n, GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!n) return -ENOMEM; vqs = kmalloc_array(VHOST_NET_VQ_MAX, sizeof(*vqs), GFP_KERNEL); if (!vqs) { kvfree(n); return -ENOMEM; } queue = kmalloc_array(VHOST_NET_BATCH, sizeof(void *), GFP_KERNEL); if (!queue) { kfree(vqs); kvfree(n); return -ENOMEM; } n->vqs[VHOST_NET_VQ_RX].rxq.queue = queue; xdp = kmalloc_array(VHOST_NET_BATCH, sizeof(*xdp), GFP_KERNEL); if (!xdp) { kfree(vqs); kvfree(n); kfree(queue); return -ENOMEM; } n->vqs[VHOST_NET_VQ_TX].xdp = xdp; dev = &n->dev; vqs[VHOST_NET_VQ_TX] = &n->vqs[VHOST_NET_VQ_TX].vq; vqs[VHOST_NET_VQ_RX] = &n->vqs[VHOST_NET_VQ_RX].vq; n->vqs[VHOST_NET_VQ_TX].vq.handle_kick = handle_tx_kick; n->vqs[VHOST_NET_VQ_RX].vq.handle_kick = handle_rx_kick; for (i = 0; i < VHOST_NET_VQ_MAX; i++) { n->vqs[i].ubufs = NULL; n->vqs[i].ubuf_info = NULL; n->vqs[i].upend_idx = 0; n->vqs[i].done_idx = 0; n->vqs[i].batched_xdp = 0; n->vqs[i].vhost_hlen = 0; n->vqs[i].sock_hlen = 0; n->vqs[i].rx_ring = NULL; vhost_net_buf_init(&n->vqs[i].rxq); } vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX, UIO_MAXIOV + VHOST_NET_BATCH, VHOST_NET_PKT_WEIGHT, VHOST_NET_WEIGHT, true, NULL); vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, EPOLLOUT, dev, vqs[VHOST_NET_VQ_TX]); vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, EPOLLIN, dev, vqs[VHOST_NET_VQ_RX]); f->private_data = n; n->page_frag.page = NULL; n->refcnt_bias = 0; return 0; } static struct socket *vhost_net_stop_vq(struct vhost_net *n, struct vhost_virtqueue *vq) { struct socket *sock; struct vhost_net_virtqueue *nvq = container_of(vq, struct vhost_net_virtqueue, vq); mutex_lock(&vq->mutex); sock = vhost_vq_get_backend(vq); vhost_net_disable_vq(n, vq); vhost_vq_set_backend(vq, NULL); vhost_net_buf_unproduce(nvq); nvq->rx_ring = NULL; mutex_unlock(&vq->mutex); return sock; } static void vhost_net_stop(struct vhost_net *n, struct socket **tx_sock, struct socket **rx_sock) { *tx_sock = vhost_net_stop_vq(n, &n->vqs[VHOST_NET_VQ_TX].vq); *rx_sock = vhost_net_stop_vq(n, &n->vqs[VHOST_NET_VQ_RX].vq); } static void vhost_net_flush(struct vhost_net *n) { vhost_dev_flush(&n->dev); if (n->vqs[VHOST_NET_VQ_TX].ubufs) { mutex_lock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex); n->tx_flush = true; mutex_unlock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex); /* Wait for all lower device DMAs done. */ vhost_net_ubuf_put_and_wait(n->vqs[VHOST_NET_VQ_TX].ubufs); mutex_lock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex); n->tx_flush = false; atomic_set(&n->vqs[VHOST_NET_VQ_TX].ubufs->refcount, 1); mutex_unlock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex); } } static int vhost_net_release(struct inode *inode, struct file *f) { struct vhost_net *n = f->private_data; struct socket *tx_sock; struct socket *rx_sock; vhost_net_stop(n, &tx_sock, &rx_sock); vhost_net_flush(n); vhost_dev_stop(&n->dev); vhost_dev_cleanup(&n->dev); vhost_net_vq_reset(n); if (tx_sock) sockfd_put(tx_sock); if (rx_sock) sockfd_put(rx_sock); /* Make sure no callbacks are outstanding */ synchronize_rcu(); /* We do an extra flush before freeing memory, * since jobs can re-queue themselves. */ vhost_net_flush(n); kfree(n->vqs[VHOST_NET_VQ_RX].rxq.queue); kfree(n->vqs[VHOST_NET_VQ_TX].xdp); kfree(n->dev.vqs); if (n->page_frag.page) __page_frag_cache_drain(n->page_frag.page, n->refcnt_bias); kvfree(n); return 0; } static struct socket *get_raw_socket(int fd) { int r; struct socket *sock = sockfd_lookup(fd, &r); if (!sock) return ERR_PTR(-ENOTSOCK); /* Parameter checking */ if (sock->sk->sk_type != SOCK_RAW) { r = -ESOCKTNOSUPPORT; goto err; } if (sock->sk->sk_family != AF_PACKET) { r = -EPFNOSUPPORT; goto err; } return sock; err: sockfd_put(sock); return ERR_PTR(r); } static struct ptr_ring *get_tap_ptr_ring(struct file *file) { struct ptr_ring *ring; ring = tun_get_tx_ring(file); if (!IS_ERR(ring)) goto out; ring = tap_get_ptr_ring(file); if (!IS_ERR(ring)) goto out; ring = NULL; out: return ring; } static struct socket *get_tap_socket(int fd) { struct file *file = fget(fd); struct socket *sock; if (!file) return ERR_PTR(-EBADF); sock = tun_get_socket(file); if (!IS_ERR(sock)) return sock; sock = tap_get_socket(file); if (IS_ERR(sock)) fput(file); return sock; } static struct socket *get_socket(int fd) { struct socket *sock; /* special case to disable backend */ if (fd == -1) return NULL; sock = get_raw_socket(fd); if (!IS_ERR(sock)) return sock; sock = get_tap_socket(fd); if (!IS_ERR(sock)) return sock; return ERR_PTR(-ENOTSOCK); } static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd) { struct socket *sock, *oldsock; struct vhost_virtqueue *vq; struct vhost_net_virtqueue *nvq; struct vhost_net_ubuf_ref *ubufs, *oldubufs = NULL; int r; mutex_lock(&n->dev.mutex); r = vhost_dev_check_owner(&n->dev); if (r) goto err; if (index >= VHOST_NET_VQ_MAX) { r = -ENOBUFS; goto err; } vq = &n->vqs[index].vq; nvq = &n->vqs[index]; mutex_lock(&vq->mutex); if (fd == -1) vhost_clear_msg(&n->dev); /* Verify that ring has been setup correctly. */ if (!vhost_vq_access_ok(vq)) { r = -EFAULT; goto err_vq; } sock = get_socket(fd); if (IS_ERR(sock)) { r = PTR_ERR(sock); goto err_vq; } /* start polling new socket */ oldsock = vhost_vq_get_backend(vq); if (sock != oldsock) { ubufs = vhost_net_ubuf_alloc(vq, sock && vhost_sock_zcopy(sock)); if (IS_ERR(ubufs)) { r = PTR_ERR(ubufs); goto err_ubufs; } vhost_net_disable_vq(n, vq); vhost_vq_set_backend(vq, sock); vhost_net_buf_unproduce(nvq); r = vhost_vq_init_access(vq); if (r) goto err_used; r = vhost_net_enable_vq(n, vq); if (r) goto err_used; if (index == VHOST_NET_VQ_RX) { if (sock) nvq->rx_ring = get_tap_ptr_ring(sock->file); else nvq->rx_ring = NULL; } oldubufs = nvq->ubufs; nvq->ubufs = ubufs; n->tx_packets = 0; n->tx_zcopy_err = 0; n->tx_flush = false; } mutex_unlock(&vq->mutex); if (oldubufs) { vhost_net_ubuf_put_wait_and_free(oldubufs); mutex_lock(&vq->mutex); vhost_zerocopy_signal_used(n, vq); mutex_unlock(&vq->mutex); } if (oldsock) { vhost_dev_flush(&n->dev); sockfd_put(oldsock); } mutex_unlock(&n->dev.mutex); return 0; err_used: vhost_vq_set_backend(vq, oldsock); vhost_net_enable_vq(n, vq); if (ubufs) vhost_net_ubuf_put_wait_and_free(ubufs); err_ubufs: if (sock) sockfd_put(sock); err_vq: mutex_unlock(&vq->mutex); err: mutex_unlock(&n->dev.mutex); return r; } static long vhost_net_reset_owner(struct vhost_net *n) { struct socket *tx_sock = NULL; struct socket *rx_sock = NULL; long err; struct vhost_iotlb *umem; mutex_lock(&n->dev.mutex); err = vhost_dev_check_owner(&n->dev); if (err) goto done; umem = vhost_dev_reset_owner_prepare(); if (!umem) { err = -ENOMEM; goto done; } vhost_net_stop(n, &tx_sock, &rx_sock); vhost_net_flush(n); vhost_dev_stop(&n->dev); vhost_dev_reset_owner(&n->dev, umem); vhost_net_vq_reset(n); done: mutex_unlock(&n->dev.mutex); if (tx_sock) sockfd_put(tx_sock); if (rx_sock) sockfd_put(rx_sock); return err; } static int vhost_net_set_features(struct vhost_net *n, u64 features) { size_t vhost_hlen, sock_hlen, hdr_len; int i; hdr_len = (features & ((1ULL << VIRTIO_NET_F_MRG_RXBUF) | (1ULL << VIRTIO_F_VERSION_1))) ? sizeof(struct virtio_net_hdr_mrg_rxbuf) : sizeof(struct virtio_net_hdr); if (features & (1 << VHOST_NET_F_VIRTIO_NET_HDR)) { /* vhost provides vnet_hdr */ vhost_hlen = hdr_len; sock_hlen = 0; } else { /* socket provides vnet_hdr */ vhost_hlen = 0; sock_hlen = hdr_len; } mutex_lock(&n->dev.mutex); if ((features & (1 << VHOST_F_LOG_ALL)) && !vhost_log_access_ok(&n->dev)) goto out_unlock; if ((features & (1ULL << VIRTIO_F_ACCESS_PLATFORM))) { if (vhost_init_device_iotlb(&n->dev)) goto out_unlock; } for (i = 0; i < VHOST_NET_VQ_MAX; ++i) { mutex_lock(&n->vqs[i].vq.mutex); n->vqs[i].vq.acked_features = features; n->vqs[i].vhost_hlen = vhost_hlen; n->vqs[i].sock_hlen = sock_hlen; mutex_unlock(&n->vqs[i].vq.mutex); } mutex_unlock(&n->dev.mutex); return 0; out_unlock: mutex_unlock(&n->dev.mutex); return -EFAULT; } static long vhost_net_set_owner(struct vhost_net *n) { int r; mutex_lock(&n->dev.mutex); if (vhost_dev_has_owner(&n->dev)) { r = -EBUSY; goto out; } r = vhost_net_set_ubuf_info(n); if (r) goto out; r = vhost_dev_set_owner(&n->dev); if (r) vhost_net_clear_ubuf_info(n); vhost_net_flush(n); out: mutex_unlock(&n->dev.mutex); return r; } static long vhost_net_ioctl(struct file *f, unsigned int ioctl, unsigned long arg) { struct vhost_net *n = f->private_data; void __user *argp = (void __user *)arg; u64 __user *featurep = argp; struct vhost_vring_file backend; u64 features; int r; switch (ioctl) { case VHOST_NET_SET_BACKEND: if (copy_from_user(&backend, argp, sizeof backend)) return -EFAULT; return vhost_net_set_backend(n, backend.index, backend.fd); case VHOST_GET_FEATURES: features = VHOST_NET_FEATURES; if (copy_to_user(featurep, &features, sizeof features)) return -EFAULT; return 0; case VHOST_SET_FEATURES: if (copy_from_user(&features, featurep, sizeof features)) return -EFAULT; if (features & ~VHOST_NET_FEATURES) return -EOPNOTSUPP; return vhost_net_set_features(n, features); case VHOST_GET_BACKEND_FEATURES: features = VHOST_NET_BACKEND_FEATURES; if (copy_to_user(featurep, &features, sizeof(features))) return -EFAULT; return 0; case VHOST_SET_BACKEND_FEATURES: if (copy_from_user(&features, featurep, sizeof(features))) return -EFAULT; if (features & ~VHOST_NET_BACKEND_FEATURES) return -EOPNOTSUPP; vhost_set_backend_features(&n->dev, features); return 0; case VHOST_RESET_OWNER: return vhost_net_reset_owner(n); case VHOST_SET_OWNER: return vhost_net_set_owner(n); default: mutex_lock(&n->dev.mutex); r = vhost_dev_ioctl(&n->dev, ioctl, argp); if (r == -ENOIOCTLCMD) r = vhost_vring_ioctl(&n->dev, ioctl, argp); else vhost_net_flush(n); mutex_unlock(&n->dev.mutex); return r; } } static ssize_t vhost_net_chr_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct vhost_net *n = file->private_data; struct vhost_dev *dev = &n->dev; int noblock = file->f_flags & O_NONBLOCK; return vhost_chr_read_iter(dev, to, noblock); } static ssize_t vhost_net_chr_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct vhost_net *n = file->private_data; struct vhost_dev *dev = &n->dev; return vhost_chr_write_iter(dev, from); } static __poll_t vhost_net_chr_poll(struct file *file, poll_table *wait) { struct vhost_net *n = file->private_data; struct vhost_dev *dev = &n->dev; return vhost_chr_poll(file, dev, wait); } static const struct file_operations vhost_net_fops = { .owner = THIS_MODULE, .release = vhost_net_release, .read_iter = vhost_net_chr_read_iter, .write_iter = vhost_net_chr_write_iter, .poll = vhost_net_chr_poll, .unlocked_ioctl = vhost_net_ioctl, .compat_ioctl = compat_ptr_ioctl, .open = vhost_net_open, .llseek = noop_llseek, }; static struct miscdevice vhost_net_misc = { .minor = VHOST_NET_MINOR, .name = "vhost-net", .fops = &vhost_net_fops, }; static int __init vhost_net_init(void) { if (experimental_zcopytx) vhost_net_enable_zcopy(VHOST_NET_VQ_TX); return misc_register(&vhost_net_misc); } module_init(vhost_net_init); static void __exit vhost_net_exit(void) { misc_deregister(&vhost_net_misc); } module_exit(vhost_net_exit); MODULE_VERSION("0.0.1"); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Michael S. Tsirkin"); MODULE_DESCRIPTION("Host kernel accelerator for virtio net"); MODULE_ALIAS_MISCDEV(VHOST_NET_MINOR); MODULE_ALIAS("devname:vhost-net");
62 61 1 1 17 17 5 5 68 68 12 12 14 14 2 2 4 4 1 1 173 3 32 201 76 76 11 1 1 3 6 2 2 2 3 4 4 4 3 3 16 1 1 2 11 1 5 1 1 2 1 8 1 1 2 2 2 10 4 6 1 3 7 6 1 1 2 1 1 1 1 8 1 1 2 3 1 5 1 8 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 /* * net/tipc/node.c: TIPC node management routines * * Copyright (c) 2000-2006, 2012-2016, Ericsson AB * Copyright (c) 2005-2006, 2010-2014, Wind River Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include "core.h" #include "link.h" #include "node.h" #include "name_distr.h" #include "socket.h" #include "bcast.h" #include "monitor.h" #include "discover.h" #include "netlink.h" #include "trace.h" #include "crypto.h" #define INVALID_NODE_SIG 0x10000 #define NODE_CLEANUP_AFTER 300000 /* Flags used to take different actions according to flag type * TIPC_NOTIFY_NODE_DOWN: notify node is down * TIPC_NOTIFY_NODE_UP: notify node is up * TIPC_DISTRIBUTE_NAME: publish or withdraw link state name type */ enum { TIPC_NOTIFY_NODE_DOWN = (1 << 3), TIPC_NOTIFY_NODE_UP = (1 << 4), TIPC_NOTIFY_LINK_UP = (1 << 6), TIPC_NOTIFY_LINK_DOWN = (1 << 7) }; struct tipc_link_entry { struct tipc_link *link; spinlock_t lock; /* per link */ u32 mtu; struct sk_buff_head inputq; struct tipc_media_addr maddr; }; struct tipc_bclink_entry { struct tipc_link *link; struct sk_buff_head inputq1; struct sk_buff_head arrvq; struct sk_buff_head inputq2; struct sk_buff_head namedq; u16 named_rcv_nxt; bool named_open; }; /** * struct tipc_node - TIPC node structure * @addr: network address of node * @kref: reference counter to node object * @lock: rwlock governing access to structure * @net: the applicable net namespace * @hash: links to adjacent nodes in unsorted hash chain * @inputq: pointer to input queue containing messages for msg event * @namedq: pointer to name table input queue with name table messages * @active_links: bearer ids of active links, used as index into links[] array * @links: array containing references to all links to node * @bc_entry: broadcast link entry * @action_flags: bit mask of different types of node actions * @state: connectivity state vs peer node * @preliminary: a preliminary node or not * @failover_sent: failover sent or not * @sync_point: sequence number where synch/failover is finished * @list: links to adjacent nodes in sorted list of cluster's nodes * @working_links: number of working links to node (both active and standby) * @link_cnt: number of links to node * @capabilities: bitmap, indicating peer node's functional capabilities * @signature: node instance identifier * @link_id: local and remote bearer ids of changing link, if any * @peer_id: 128-bit ID of peer * @peer_id_string: ID string of peer * @publ_list: list of publications * @conn_sks: list of connections (FIXME) * @timer: node's keepalive timer * @keepalive_intv: keepalive interval in milliseconds * @rcu: rcu struct for tipc_node * @delete_at: indicates the time for deleting a down node * @peer_net: peer's net namespace * @peer_hash_mix: hash for this peer (FIXME) * @crypto_rx: RX crypto handler */ struct tipc_node { u32 addr; struct kref kref; rwlock_t lock; struct net *net; struct hlist_node hash; int active_links[2]; struct tipc_link_entry links[MAX_BEARERS]; struct tipc_bclink_entry bc_entry; int action_flags; struct list_head list; int state; bool preliminary; bool failover_sent; u16 sync_point; int link_cnt; u16 working_links; u16 capabilities; u32 signature; u32 link_id; u8 peer_id[16]; char peer_id_string[NODE_ID_STR_LEN]; struct list_head publ_list; struct list_head conn_sks; unsigned long keepalive_intv; struct timer_list timer; struct rcu_head rcu; unsigned long delete_at; struct net *peer_net; u32 peer_hash_mix; #ifdef CONFIG_TIPC_CRYPTO struct tipc_crypto *crypto_rx; #endif }; /* Node FSM states and events: */ enum { SELF_DOWN_PEER_DOWN = 0xdd, SELF_UP_PEER_UP = 0xaa, SELF_DOWN_PEER_LEAVING = 0xd1, SELF_UP_PEER_COMING = 0xac, SELF_COMING_PEER_UP = 0xca, SELF_LEAVING_PEER_DOWN = 0x1d, NODE_FAILINGOVER = 0xf0, NODE_SYNCHING = 0xcc }; enum { SELF_ESTABL_CONTACT_EVT = 0xece, SELF_LOST_CONTACT_EVT = 0x1ce, PEER_ESTABL_CONTACT_EVT = 0x9ece, PEER_LOST_CONTACT_EVT = 0x91ce, NODE_FAILOVER_BEGIN_EVT = 0xfbe, NODE_FAILOVER_END_EVT = 0xfee, NODE_SYNCH_BEGIN_EVT = 0xcbe, NODE_SYNCH_END_EVT = 0xcee }; static void __tipc_node_link_down(struct tipc_node *n, int *bearer_id, struct sk_buff_head *xmitq, struct tipc_media_addr **maddr); static void tipc_node_link_down(struct tipc_node *n, int bearer_id, bool delete); static void node_lost_contact(struct tipc_node *n, struct sk_buff_head *inputq); static void tipc_node_delete(struct tipc_node *node); static void tipc_node_timeout(struct timer_list *t); static void tipc_node_fsm_evt(struct tipc_node *n, int evt); static struct tipc_node *tipc_node_find(struct net *net, u32 addr); static struct tipc_node *tipc_node_find_by_id(struct net *net, u8 *id); static bool node_is_up(struct tipc_node *n); static void tipc_node_delete_from_list(struct tipc_node *node); struct tipc_sock_conn { u32 port; u32 peer_port; u32 peer_node; struct list_head list; }; static struct tipc_link *node_active_link(struct tipc_node *n, int sel) { int bearer_id = n->active_links[sel & 1]; if (unlikely(bearer_id == INVALID_BEARER_ID)) return NULL; return n->links[bearer_id].link; } int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel, bool connected) { struct tipc_node *n; int bearer_id; unsigned int mtu = MAX_MSG_SIZE; n = tipc_node_find(net, addr); if (unlikely(!n)) return mtu; /* Allow MAX_MSG_SIZE when building connection oriented message * if they are in the same core network */ if (n->peer_net && connected) { tipc_node_put(n); return mtu; } bearer_id = n->active_links[sel & 1]; if (likely(bearer_id != INVALID_BEARER_ID)) mtu = n->links[bearer_id].mtu; tipc_node_put(n); return mtu; } bool tipc_node_get_id(struct net *net, u32 addr, u8 *id) { u8 *own_id = tipc_own_id(net); struct tipc_node *n; if (!own_id) return true; if (addr == tipc_own_addr(net)) { memcpy(id, own_id, TIPC_NODEID_LEN); return true; } n = tipc_node_find(net, addr); if (!n) return false; memcpy(id, &n->peer_id, TIPC_NODEID_LEN); tipc_node_put(n); return true; } u16 tipc_node_get_capabilities(struct net *net, u32 addr) { struct tipc_node *n; u16 caps; n = tipc_node_find(net, addr); if (unlikely(!n)) return TIPC_NODE_CAPABILITIES; caps = n->capabilities; tipc_node_put(n); return caps; } u32 tipc_node_get_addr(struct tipc_node *node) { return (node) ? node->addr : 0; } char *tipc_node_get_id_str(struct tipc_node *node) { return node->peer_id_string; } #ifdef CONFIG_TIPC_CRYPTO /** * tipc_node_crypto_rx - Retrieve crypto RX handle from node * @__n: target tipc_node * Note: node ref counter must be held first! */ struct tipc_crypto *tipc_node_crypto_rx(struct tipc_node *__n) { return (__n) ? __n->crypto_rx : NULL; } struct tipc_crypto *tipc_node_crypto_rx_by_list(struct list_head *pos) { return container_of(pos, struct tipc_node, list)->crypto_rx; } struct tipc_crypto *tipc_node_crypto_rx_by_addr(struct net *net, u32 addr) { struct tipc_node *n; n = tipc_node_find(net, addr); return (n) ? n->crypto_rx : NULL; } #endif static void tipc_node_free(struct rcu_head *rp) { struct tipc_node *n = container_of(rp, struct tipc_node, rcu); #ifdef CONFIG_TIPC_CRYPTO tipc_crypto_stop(&n->crypto_rx); #endif kfree(n); } static void tipc_node_kref_release(struct kref *kref) { struct tipc_node *n = container_of(kref, struct tipc_node, kref); kfree(n->bc_entry.link); call_rcu(&n->rcu, tipc_node_free); } void tipc_node_put(struct tipc_node *node) { kref_put(&node->kref, tipc_node_kref_release); } void tipc_node_get(struct tipc_node *node) { kref_get(&node->kref); } /* * tipc_node_find - locate specified node object, if it exists */ static struct tipc_node *tipc_node_find(struct net *net, u32 addr) { struct tipc_net *tn = tipc_net(net); struct tipc_node *node; unsigned int thash = tipc_hashfn(addr); rcu_read_lock(); hlist_for_each_entry_rcu(node, &tn->node_htable[thash], hash) { if (node->addr != addr || node->preliminary) continue; if (!kref_get_unless_zero(&node->kref)) node = NULL; break; } rcu_read_unlock(); return node; } /* tipc_node_find_by_id - locate specified node object by its 128-bit id * Note: this function is called only when a discovery request failed * to find the node by its 32-bit id, and is not time critical */ static struct tipc_node *tipc_node_find_by_id(struct net *net, u8 *id) { struct tipc_net *tn = tipc_net(net); struct tipc_node *n; bool found = false; rcu_read_lock(); list_for_each_entry_rcu(n, &tn->node_list, list) { read_lock_bh(&n->lock); if (!memcmp(id, n->peer_id, 16) && kref_get_unless_zero(&n->kref)) found = true; read_unlock_bh(&n->lock); if (found) break; } rcu_read_unlock(); return found ? n : NULL; } static void tipc_node_read_lock(struct tipc_node *n) __acquires(n->lock) { read_lock_bh(&n->lock); } static void tipc_node_read_unlock(struct tipc_node *n) __releases(n->lock) { read_unlock_bh(&n->lock); } static void tipc_node_write_lock(struct tipc_node *n) __acquires(n->lock) { write_lock_bh(&n->lock); } static void tipc_node_write_unlock_fast(struct tipc_node *n) __releases(n->lock) { write_unlock_bh(&n->lock); } static void tipc_node_write_unlock(struct tipc_node *n) __releases(n->lock) { struct tipc_socket_addr sk; struct net *net = n->net; u32 flags = n->action_flags; struct list_head *publ_list; struct tipc_uaddr ua; u32 bearer_id, node; if (likely(!flags)) { write_unlock_bh(&n->lock); return; } tipc_uaddr(&ua, TIPC_SERVICE_RANGE, TIPC_NODE_SCOPE, TIPC_LINK_STATE, n->addr, n->addr); sk.ref = n->link_id; sk.node = tipc_own_addr(net); node = n->addr; bearer_id = n->link_id & 0xffff; publ_list = &n->publ_list; n->action_flags &= ~(TIPC_NOTIFY_NODE_DOWN | TIPC_NOTIFY_NODE_UP | TIPC_NOTIFY_LINK_DOWN | TIPC_NOTIFY_LINK_UP); write_unlock_bh(&n->lock); if (flags & TIPC_NOTIFY_NODE_DOWN) tipc_publ_notify(net, publ_list, node, n->capabilities); if (flags & TIPC_NOTIFY_NODE_UP) tipc_named_node_up(net, node, n->capabilities); if (flags & TIPC_NOTIFY_LINK_UP) { tipc_mon_peer_up(net, node, bearer_id); tipc_nametbl_publish(net, &ua, &sk, sk.ref); } if (flags & TIPC_NOTIFY_LINK_DOWN) { tipc_mon_peer_down(net, node, bearer_id); tipc_nametbl_withdraw(net, &ua, &sk, sk.ref); } } static void tipc_node_assign_peer_net(struct tipc_node *n, u32 hash_mixes) { int net_id = tipc_netid(n->net); struct tipc_net *tn_peer; struct net *tmp; u32 hash_chk; if (n->peer_net) return; for_each_net_rcu(tmp) { tn_peer = tipc_net(tmp); if (!tn_peer) continue; /* Integrity checking whether node exists in namespace or not */ if (tn_peer->net_id != net_id) continue; if (memcmp(n->peer_id, tn_peer->node_id, NODE_ID_LEN)) continue; hash_chk = tipc_net_hash_mixes(tmp, tn_peer->random); if (hash_mixes ^ hash_chk) continue; n->peer_net = tmp; n->peer_hash_mix = hash_mixes; break; } } struct tipc_node *tipc_node_create(struct net *net, u32 addr, u8 *peer_id, u16 capabilities, u32 hash_mixes, bool preliminary) { struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_link *l, *snd_l = tipc_bc_sndlink(net); struct tipc_node *n, *temp_node; unsigned long intv; int bearer_id; int i; spin_lock_bh(&tn->node_list_lock); n = tipc_node_find(net, addr) ?: tipc_node_find_by_id(net, peer_id); if (n) { if (!n->preliminary) goto update; if (preliminary) goto exit; /* A preliminary node becomes "real" now, refresh its data */ tipc_node_write_lock(n); if (!tipc_link_bc_create(net, tipc_own_addr(net), addr, peer_id, U16_MAX, tipc_link_min_win(snd_l), tipc_link_max_win(snd_l), n->capabilities, &n->bc_entry.inputq1, &n->bc_entry.namedq, snd_l, &n->bc_entry.link)) { pr_warn("Broadcast rcv link refresh failed, no memory\n"); tipc_node_write_unlock_fast(n); tipc_node_put(n); n = NULL; goto exit; } n->preliminary = false; n->addr = addr; hlist_del_rcu(&n->hash); hlist_add_head_rcu(&n->hash, &tn->node_htable[tipc_hashfn(addr)]); list_del_rcu(&n->list); list_for_each_entry_rcu(temp_node, &tn->node_list, list) { if (n->addr < temp_node->addr) break; } list_add_tail_rcu(&n->list, &temp_node->list); tipc_node_write_unlock_fast(n); update: if (n->peer_hash_mix ^ hash_mixes) tipc_node_assign_peer_net(n, hash_mixes); if (n->capabilities == capabilities) goto exit; /* Same node may come back with new capabilities */ tipc_node_write_lock(n); n->capabilities = capabilities; for (bearer_id = 0; bearer_id < MAX_BEARERS; bearer_id++) { l = n->links[bearer_id].link; if (l) tipc_link_update_caps(l, capabilities); } tipc_node_write_unlock_fast(n); /* Calculate cluster capabilities */ tn->capabilities = TIPC_NODE_CAPABILITIES; list_for_each_entry_rcu(temp_node, &tn->node_list, list) { tn->capabilities &= temp_node->capabilities; } tipc_bcast_toggle_rcast(net, (tn->capabilities & TIPC_BCAST_RCAST)); goto exit; } n = kzalloc(sizeof(*n), GFP_ATOMIC); if (!n) { pr_warn("Node creation failed, no memory\n"); goto exit; } tipc_nodeid2string(n->peer_id_string, peer_id); #ifdef CONFIG_TIPC_CRYPTO if (unlikely(tipc_crypto_start(&n->crypto_rx, net, n))) { pr_warn("Failed to start crypto RX(%s)!\n", n->peer_id_string); kfree(n); n = NULL; goto exit; } #endif n->addr = addr; n->preliminary = preliminary; memcpy(&n->peer_id, peer_id, 16); n->net = net; n->peer_net = NULL; n->peer_hash_mix = 0; /* Assign kernel local namespace if exists */ tipc_node_assign_peer_net(n, hash_mixes); n->capabilities = capabilities; kref_init(&n->kref); rwlock_init(&n->lock); INIT_HLIST_NODE(&n->hash); INIT_LIST_HEAD(&n->list); INIT_LIST_HEAD(&n->publ_list); INIT_LIST_HEAD(&n->conn_sks); skb_queue_head_init(&n->bc_entry.namedq); skb_queue_head_init(&n->bc_entry.inputq1); __skb_queue_head_init(&n->bc_entry.arrvq); skb_queue_head_init(&n->bc_entry.inputq2); for (i = 0; i < MAX_BEARERS; i++) spin_lock_init(&n->links[i].lock); n->state = SELF_DOWN_PEER_LEAVING; n->delete_at = jiffies + msecs_to_jiffies(NODE_CLEANUP_AFTER); n->signature = INVALID_NODE_SIG; n->active_links[0] = INVALID_BEARER_ID; n->active_links[1] = INVALID_BEARER_ID; if (!preliminary && !tipc_link_bc_create(net, tipc_own_addr(net), addr, peer_id, U16_MAX, tipc_link_min_win(snd_l), tipc_link_max_win(snd_l), n->capabilities, &n->bc_entry.inputq1, &n->bc_entry.namedq, snd_l, &n->bc_entry.link)) { pr_warn("Broadcast rcv link creation failed, no memory\n"); tipc_node_put(n); n = NULL; goto exit; } tipc_node_get(n); timer_setup(&n->timer, tipc_node_timeout, 0); /* Start a slow timer anyway, crypto needs it */ n->keepalive_intv = 10000; intv = jiffies + msecs_to_jiffies(n->keepalive_intv); if (!mod_timer(&n->timer, intv)) tipc_node_get(n); hlist_add_head_rcu(&n->hash, &tn->node_htable[tipc_hashfn(addr)]); list_for_each_entry_rcu(temp_node, &tn->node_list, list) { if (n->addr < temp_node->addr) break; } list_add_tail_rcu(&n->list, &temp_node->list); /* Calculate cluster capabilities */ tn->capabilities = TIPC_NODE_CAPABILITIES; list_for_each_entry_rcu(temp_node, &tn->node_list, list) { tn->capabilities &= temp_node->capabilities; } tipc_bcast_toggle_rcast(net, (tn->capabilities & TIPC_BCAST_RCAST)); trace_tipc_node_create(n, true, " "); exit: spin_unlock_bh(&tn->node_list_lock); return n; } static void tipc_node_calculate_timer(struct tipc_node *n, struct tipc_link *l) { unsigned long tol = tipc_link_tolerance(l); unsigned long intv = ((tol / 4) > 500) ? 500 : tol / 4; /* Link with lowest tolerance determines timer interval */ if (intv < n->keepalive_intv) n->keepalive_intv = intv; /* Ensure link's abort limit corresponds to current tolerance */ tipc_link_set_abort_limit(l, tol / n->keepalive_intv); } static void tipc_node_delete_from_list(struct tipc_node *node) { #ifdef CONFIG_TIPC_CRYPTO tipc_crypto_key_flush(node->crypto_rx); #endif list_del_rcu(&node->list); hlist_del_rcu(&node->hash); tipc_node_put(node); } static void tipc_node_delete(struct tipc_node *node) { trace_tipc_node_delete(node, true, " "); tipc_node_delete_from_list(node); del_timer_sync(&node->timer); tipc_node_put(node); } void tipc_node_stop(struct net *net) { struct tipc_net *tn = tipc_net(net); struct tipc_node *node, *t_node; spin_lock_bh(&tn->node_list_lock); list_for_each_entry_safe(node, t_node, &tn->node_list, list) tipc_node_delete(node); spin_unlock_bh(&tn->node_list_lock); } void tipc_node_subscribe(struct net *net, struct list_head *subscr, u32 addr) { struct tipc_node *n; if (in_own_node(net, addr)) return; n = tipc_node_find(net, addr); if (!n) { pr_warn("Node subscribe rejected, unknown node 0x%x\n", addr); return; } tipc_node_write_lock(n); list_add_tail(subscr, &n->publ_list); tipc_node_write_unlock_fast(n); tipc_node_put(n); } void tipc_node_unsubscribe(struct net *net, struct list_head *subscr, u32 addr) { struct tipc_node *n; if (in_own_node(net, addr)) return; n = tipc_node_find(net, addr); if (!n) { pr_warn("Node unsubscribe rejected, unknown node 0x%x\n", addr); return; } tipc_node_write_lock(n); list_del_init(subscr); tipc_node_write_unlock_fast(n); tipc_node_put(n); } int tipc_node_add_conn(struct net *net, u32 dnode, u32 port, u32 peer_port) { struct tipc_node *node; struct tipc_sock_conn *conn; int err = 0; if (in_own_node(net, dnode)) return 0; node = tipc_node_find(net, dnode); if (!node) { pr_warn("Connecting sock to node 0x%x failed\n", dnode); return -EHOSTUNREACH; } conn = kmalloc(sizeof(*conn), GFP_ATOMIC); if (!conn) { err = -EHOSTUNREACH; goto exit; } conn->peer_node = dnode; conn->port = port; conn->peer_port = peer_port; tipc_node_write_lock(node); list_add_tail(&conn->list, &node->conn_sks); tipc_node_write_unlock(node); exit: tipc_node_put(node); return err; } void tipc_node_remove_conn(struct net *net, u32 dnode, u32 port) { struct tipc_node *node; struct tipc_sock_conn *conn, *safe; if (in_own_node(net, dnode)) return; node = tipc_node_find(net, dnode); if (!node) return; tipc_node_write_lock(node); list_for_each_entry_safe(conn, safe, &node->conn_sks, list) { if (port != conn->port) continue; list_del(&conn->list); kfree(conn); } tipc_node_write_unlock(node); tipc_node_put(node); } static void tipc_node_clear_links(struct tipc_node *node) { int i; for (i = 0; i < MAX_BEARERS; i++) { struct tipc_link_entry *le = &node->links[i]; if (le->link) { kfree(le->link); le->link = NULL; node->link_cnt--; } } } /* tipc_node_cleanup - delete nodes that does not * have active links for NODE_CLEANUP_AFTER time */ static bool tipc_node_cleanup(struct tipc_node *peer) { struct tipc_node *temp_node; struct tipc_net *tn = tipc_net(peer->net); bool deleted = false; /* If lock held by tipc_node_stop() the node will be deleted anyway */ if (!spin_trylock_bh(&tn->node_list_lock)) return false; tipc_node_write_lock(peer); if (!node_is_up(peer) && time_after(jiffies, peer->delete_at)) { tipc_node_clear_links(peer); tipc_node_delete_from_list(peer); deleted = true; } tipc_node_write_unlock(peer); if (!deleted) { spin_unlock_bh(&tn->node_list_lock); return deleted; } /* Calculate cluster capabilities */ tn->capabilities = TIPC_NODE_CAPABILITIES; list_for_each_entry_rcu(temp_node, &tn->node_list, list) { tn->capabilities &= temp_node->capabilities; } tipc_bcast_toggle_rcast(peer->net, (tn->capabilities & TIPC_BCAST_RCAST)); spin_unlock_bh(&tn->node_list_lock); return deleted; } /* tipc_node_timeout - handle expiration of node timer */ static void tipc_node_timeout(struct timer_list *t) { struct tipc_node *n = from_timer(n, t, timer); struct tipc_link_entry *le; struct sk_buff_head xmitq; int remains = n->link_cnt; int bearer_id; int rc = 0; trace_tipc_node_timeout(n, false, " "); if (!node_is_up(n) && tipc_node_cleanup(n)) { /*Removing the reference of Timer*/ tipc_node_put(n); return; } #ifdef CONFIG_TIPC_CRYPTO /* Take any crypto key related actions first */ tipc_crypto_timeout(n->crypto_rx); #endif __skb_queue_head_init(&xmitq); /* Initial node interval to value larger (10 seconds), then it will be * recalculated with link lowest tolerance */ tipc_node_read_lock(n); n->keepalive_intv = 10000; tipc_node_read_unlock(n); for (bearer_id = 0; remains && (bearer_id < MAX_BEARERS); bearer_id++) { tipc_node_read_lock(n); le = &n->links[bearer_id]; if (le->link) { spin_lock_bh(&le->lock); /* Link tolerance may change asynchronously: */ tipc_node_calculate_timer(n, le->link); rc = tipc_link_timeout(le->link, &xmitq); spin_unlock_bh(&le->lock); remains--; } tipc_node_read_unlock(n); tipc_bearer_xmit(n->net, bearer_id, &xmitq, &le->maddr, n); if (rc & TIPC_LINK_DOWN_EVT) tipc_node_link_down(n, bearer_id, false); } mod_timer(&n->timer, jiffies + msecs_to_jiffies(n->keepalive_intv)); } /** * __tipc_node_link_up - handle addition of link * @n: target tipc_node * @bearer_id: id of the bearer * @xmitq: queue for messages to be xmited on * Node lock must be held by caller * Link becomes active (alone or shared) or standby, depending on its priority. */ static void __tipc_node_link_up(struct tipc_node *n, int bearer_id, struct sk_buff_head *xmitq) { int *slot0 = &n->active_links[0]; int *slot1 = &n->active_links[1]; struct tipc_link *ol = node_active_link(n, 0); struct tipc_link *nl = n->links[bearer_id].link; if (!nl || tipc_link_is_up(nl)) return; tipc_link_fsm_evt(nl, LINK_ESTABLISH_EVT); if (!tipc_link_is_up(nl)) return; n->working_links++; n->action_flags |= TIPC_NOTIFY_LINK_UP; n->link_id = tipc_link_id(nl); /* Leave room for tunnel header when returning 'mtu' to users: */ n->links[bearer_id].mtu = tipc_link_mss(nl); tipc_bearer_add_dest(n->net, bearer_id, n->addr); tipc_bcast_inc_bearer_dst_cnt(n->net, bearer_id); pr_debug("Established link <%s> on network plane %c\n", tipc_link_name(nl), tipc_link_plane(nl)); trace_tipc_node_link_up(n, true, " "); /* Ensure that a STATE message goes first */ tipc_link_build_state_msg(nl, xmitq); /* First link? => give it both slots */ if (!ol) { *slot0 = bearer_id; *slot1 = bearer_id; tipc_node_fsm_evt(n, SELF_ESTABL_CONTACT_EVT); n->action_flags |= TIPC_NOTIFY_NODE_UP; tipc_link_set_active(nl, true); tipc_bcast_add_peer(n->net, nl, xmitq); return; } /* Second link => redistribute slots */ if (tipc_link_prio(nl) > tipc_link_prio(ol)) { pr_debug("Old link <%s> becomes standby\n", tipc_link_name(ol)); *slot0 = bearer_id; *slot1 = bearer_id; tipc_link_set_active(nl, true); tipc_link_set_active(ol, false); } else if (tipc_link_prio(nl) == tipc_link_prio(ol)) { tipc_link_set_active(nl, true); *slot1 = bearer_id; } else { pr_debug("New link <%s> is standby\n", tipc_link_name(nl)); } /* Prepare synchronization with first link */ tipc_link_tnl_prepare(ol, nl, SYNCH_MSG, xmitq); } /** * tipc_node_link_up - handle addition of link * @n: target tipc_node * @bearer_id: id of the bearer * @xmitq: queue for messages to be xmited on * * Link becomes active (alone or shared) or standby, depending on its priority. */ static void tipc_node_link_up(struct tipc_node *n, int bearer_id, struct sk_buff_head *xmitq) { struct tipc_media_addr *maddr; tipc_node_write_lock(n); __tipc_node_link_up(n, bearer_id, xmitq); maddr = &n->links[bearer_id].maddr; tipc_bearer_xmit(n->net, bearer_id, xmitq, maddr, n); tipc_node_write_unlock(n); } /** * tipc_node_link_failover() - start failover in case "half-failover" * * This function is only called in a very special situation where link * failover can be already started on peer node but not on this node. * This can happen when e.g.:: * * 1. Both links <1A-2A>, <1B-2B> down * 2. Link endpoint 2A up, but 1A still down (e.g. due to network * disturbance, wrong session, etc.) * 3. Link <1B-2B> up * 4. Link endpoint 2A down (e.g. due to link tolerance timeout) * 5. Node 2 starts failover onto link <1B-2B> * * ==> Node 1 does never start link/node failover! * * @n: tipc node structure * @l: link peer endpoint failingover (- can be NULL) * @tnl: tunnel link * @xmitq: queue for messages to be xmited on tnl link later */ static void tipc_node_link_failover(struct tipc_node *n, struct tipc_link *l, struct tipc_link *tnl, struct sk_buff_head *xmitq) { /* Avoid to be "self-failover" that can never end */ if (!tipc_link_is_up(tnl)) return; /* Don't rush, failure link may be in the process of resetting */ if (l && !tipc_link_is_reset(l)) return; tipc_link_fsm_evt(tnl, LINK_SYNCH_END_EVT); tipc_node_fsm_evt(n, NODE_SYNCH_END_EVT); n->sync_point = tipc_link_rcv_nxt(tnl) + (U16_MAX / 2 - 1); tipc_link_failover_prepare(l, tnl, xmitq); if (l) tipc_link_fsm_evt(l, LINK_FAILOVER_BEGIN_EVT); tipc_node_fsm_evt(n, NODE_FAILOVER_BEGIN_EVT); } /** * __tipc_node_link_down - handle loss of link * @n: target tipc_node * @bearer_id: id of the bearer * @xmitq: queue for messages to be xmited on * @maddr: output media address of the bearer */ static void __tipc_node_link_down(struct tipc_node *n, int *bearer_id, struct sk_buff_head *xmitq, struct tipc_media_addr **maddr) { struct tipc_link_entry *le = &n->links[*bearer_id]; int *slot0 = &n->active_links[0]; int *slot1 = &n->active_links[1]; int i, highest = 0, prio; struct tipc_link *l, *_l, *tnl; l = n->links[*bearer_id].link; if (!l || tipc_link_is_reset(l)) return; n->working_links--; n->action_flags |= TIPC_NOTIFY_LINK_DOWN; n->link_id = tipc_link_id(l); tipc_bearer_remove_dest(n->net, *bearer_id, n->addr); pr_debug("Lost link <%s> on network plane %c\n", tipc_link_name(l), tipc_link_plane(l)); /* Select new active link if any available */ *slot0 = INVALID_BEARER_ID; *slot1 = INVALID_BEARER_ID; for (i = 0; i < MAX_BEARERS; i++) { _l = n->links[i].link; if (!_l || !tipc_link_is_up(_l)) continue; if (_l == l) continue; prio = tipc_link_prio(_l); if (prio < highest) continue; if (prio > highest) { highest = prio; *slot0 = i; *slot1 = i; continue; } *slot1 = i; } if (!node_is_up(n)) { if (tipc_link_peer_is_down(l)) tipc_node_fsm_evt(n, PEER_LOST_CONTACT_EVT); tipc_node_fsm_evt(n, SELF_LOST_CONTACT_EVT); trace_tipc_link_reset(l, TIPC_DUMP_ALL, "link down!"); tipc_link_fsm_evt(l, LINK_RESET_EVT); tipc_link_reset(l); tipc_link_build_reset_msg(l, xmitq); *maddr = &n->links[*bearer_id].maddr; node_lost_contact(n, &le->inputq); tipc_bcast_dec_bearer_dst_cnt(n->net, *bearer_id); return; } tipc_bcast_dec_bearer_dst_cnt(n->net, *bearer_id); /* There is still a working link => initiate failover */ *bearer_id = n->active_links[0]; tnl = n->links[*bearer_id].link; tipc_link_fsm_evt(tnl, LINK_SYNCH_END_EVT); tipc_node_fsm_evt(n, NODE_SYNCH_END_EVT); n->sync_point = tipc_link_rcv_nxt(tnl) + (U16_MAX / 2 - 1); tipc_link_tnl_prepare(l, tnl, FAILOVER_MSG, xmitq); trace_tipc_link_reset(l, TIPC_DUMP_ALL, "link down -> failover!"); tipc_link_reset(l); tipc_link_fsm_evt(l, LINK_RESET_EVT); tipc_link_fsm_evt(l, LINK_FAILOVER_BEGIN_EVT); tipc_node_fsm_evt(n, NODE_FAILOVER_BEGIN_EVT); *maddr = &n->links[*bearer_id].maddr; } static void tipc_node_link_down(struct tipc_node *n, int bearer_id, bool delete) { struct tipc_link_entry *le = &n->links[bearer_id]; struct tipc_media_addr *maddr = NULL; struct tipc_link *l = le->link; int old_bearer_id = bearer_id; struct sk_buff_head xmitq; if (!l) return; __skb_queue_head_init(&xmitq); tipc_node_write_lock(n); if (!tipc_link_is_establishing(l)) { __tipc_node_link_down(n, &bearer_id, &xmitq, &maddr); } else { /* Defuse pending tipc_node_link_up() */ tipc_link_reset(l); tipc_link_fsm_evt(l, LINK_RESET_EVT); } if (delete) { kfree(l); le->link = NULL; n->link_cnt--; } trace_tipc_node_link_down(n, true, "node link down or deleted!"); tipc_node_write_unlock(n); if (delete) tipc_mon_remove_peer(n->net, n->addr, old_bearer_id); if (!skb_queue_empty(&xmitq)) tipc_bearer_xmit(n->net, bearer_id, &xmitq, maddr, n); tipc_sk_rcv(n->net, &le->inputq); } static bool node_is_up(struct tipc_node *n) { return n->active_links[0] != INVALID_BEARER_ID; } bool tipc_node_is_up(struct net *net, u32 addr) { struct tipc_node *n; bool retval = false; if (in_own_node(net, addr)) return true; n = tipc_node_find(net, addr); if (!n) return false; retval = node_is_up(n); tipc_node_put(n); return retval; } static u32 tipc_node_suggest_addr(struct net *net, u32 addr) { struct tipc_node *n; addr ^= tipc_net(net)->random; while ((n = tipc_node_find(net, addr))) { tipc_node_put(n); addr++; } return addr; } /* tipc_node_try_addr(): Check if addr can be used by peer, suggest other if not * Returns suggested address if any, otherwise 0 */ u32 tipc_node_try_addr(struct net *net, u8 *id, u32 addr) { struct tipc_net *tn = tipc_net(net); struct tipc_node *n; bool preliminary; u32 sugg_addr; /* Suggest new address if some other peer is using this one */ n = tipc_node_find(net, addr); if (n) { if (!memcmp(n->peer_id, id, NODE_ID_LEN)) addr = 0; tipc_node_put(n); if (!addr) return 0; return tipc_node_suggest_addr(net, addr); } /* Suggest previously used address if peer is known */ n = tipc_node_find_by_id(net, id); if (n) { sugg_addr = n->addr; preliminary = n->preliminary; tipc_node_put(n); if (!preliminary) return sugg_addr; } /* Even this node may be in conflict */ if (tn->trial_addr == addr) return tipc_node_suggest_addr(net, addr); return 0; } void tipc_node_check_dest(struct net *net, u32 addr, u8 *peer_id, struct tipc_bearer *b, u16 capabilities, u32 signature, u32 hash_mixes, struct tipc_media_addr *maddr, bool *respond, bool *dupl_addr) { struct tipc_node *n; struct tipc_link *l; struct tipc_link_entry *le; bool addr_match = false; bool sign_match = false; bool link_up = false; bool link_is_reset = false; bool accept_addr = false; bool reset = false; char *if_name; unsigned long intv; u16 session; *dupl_addr = false; *respond = false; n = tipc_node_create(net, addr, peer_id, capabilities, hash_mixes, false); if (!n) return; tipc_node_write_lock(n); le = &n->links[b->identity]; /* Prepare to validate requesting node's signature and media address */ l = le->link; link_up = l && tipc_link_is_up(l); link_is_reset = l && tipc_link_is_reset(l); addr_match = l && !memcmp(&le->maddr, maddr, sizeof(*maddr)); sign_match = (signature == n->signature); /* These three flags give us eight permutations: */ if (sign_match && addr_match && link_up) { /* All is fine. Ignore requests. */ /* Peer node is not a container/local namespace */ if (!n->peer_hash_mix) n->peer_hash_mix = hash_mixes; } else if (sign_match && addr_match && !link_up) { /* Respond. The link will come up in due time */ *respond = true; } else if (sign_match && !addr_match && link_up) { /* Peer has changed i/f address without rebooting. * If so, the link will reset soon, and the next * discovery will be accepted. So we can ignore it. * It may also be a cloned or malicious peer having * chosen the same node address and signature as an * existing one. * Ignore requests until the link goes down, if ever. */ *dupl_addr = true; } else if (sign_match && !addr_match && !link_up) { /* Peer link has changed i/f address without rebooting. * It may also be a cloned or malicious peer; we can't * distinguish between the two. * The signature is correct, so we must accept. */ accept_addr = true; *respond = true; reset = true; } else if (!sign_match && addr_match && link_up) { /* Peer node rebooted. Two possibilities: * - Delayed re-discovery; this link endpoint has already * reset and re-established contact with the peer, before * receiving a discovery message from that node. * (The peer happened to receive one from this node first). * - The peer came back so fast that our side has not * discovered it yet. Probing from this side will soon * reset the link, since there can be no working link * endpoint at the peer end, and the link will re-establish. * Accept the signature, since it comes from a known peer. */ n->signature = signature; } else if (!sign_match && addr_match && !link_up) { /* The peer node has rebooted. * Accept signature, since it is a known peer. */ n->signature = signature; *respond = true; } else if (!sign_match && !addr_match && link_up) { /* Peer rebooted with new address, or a new/duplicate peer. * Ignore until the link goes down, if ever. */ *dupl_addr = true; } else if (!sign_match && !addr_match && !link_up) { /* Peer rebooted with new address, or it is a new peer. * Accept signature and address. */ n->signature = signature; accept_addr = true; *respond = true; reset = true; } if (!accept_addr) goto exit; /* Now create new link if not already existing */ if (!l) { if (n->link_cnt == 2) goto exit; if_name = strchr(b->name, ':') + 1; get_random_bytes(&session, sizeof(u16)); if (!tipc_link_create(net, if_name, b->identity, b->tolerance, b->net_plane, b->mtu, b->priority, b->min_win, b->max_win, session, tipc_own_addr(net), addr, peer_id, n->capabilities, tipc_bc_sndlink(n->net), n->bc_entry.link, &le->inputq, &n->bc_entry.namedq, &l)) { *respond = false; goto exit; } trace_tipc_link_reset(l, TIPC_DUMP_ALL, "link created!"); tipc_link_reset(l); tipc_link_fsm_evt(l, LINK_RESET_EVT); if (n->state == NODE_FAILINGOVER) tipc_link_fsm_evt(l, LINK_FAILOVER_BEGIN_EVT); link_is_reset = tipc_link_is_reset(l); le->link = l; n->link_cnt++; tipc_node_calculate_timer(n, l); if (n->link_cnt == 1) { intv = jiffies + msecs_to_jiffies(n->keepalive_intv); if (!mod_timer(&n->timer, intv)) tipc_node_get(n); } } memcpy(&le->maddr, maddr, sizeof(*maddr)); exit: tipc_node_write_unlock(n); if (reset && !link_is_reset) tipc_node_link_down(n, b->identity, false); tipc_node_put(n); } void tipc_node_delete_links(struct net *net, int bearer_id) { struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_node *n; rcu_read_lock(); list_for_each_entry_rcu(n, &tn->node_list, list) { tipc_node_link_down(n, bearer_id, true); } rcu_read_unlock(); } static void tipc_node_reset_links(struct tipc_node *n) { int i; pr_warn("Resetting all links to %x\n", n->addr); trace_tipc_node_reset_links(n, true, " "); for (i = 0; i < MAX_BEARERS; i++) { tipc_node_link_down(n, i, false); } } /* tipc_node_fsm_evt - node finite state machine * Determines when contact is allowed with peer node */ static void tipc_node_fsm_evt(struct tipc_node *n, int evt) { int state = n->state; switch (state) { case SELF_DOWN_PEER_DOWN: switch (evt) { case SELF_ESTABL_CONTACT_EVT: state = SELF_UP_PEER_COMING; break; case PEER_ESTABL_CONTACT_EVT: state = SELF_COMING_PEER_UP; break; case SELF_LOST_CONTACT_EVT: case PEER_LOST_CONTACT_EVT: break; case NODE_SYNCH_END_EVT: case NODE_SYNCH_BEGIN_EVT: case NODE_FAILOVER_BEGIN_EVT: case NODE_FAILOVER_END_EVT: default: goto illegal_evt; } break; case SELF_UP_PEER_UP: switch (evt) { case SELF_LOST_CONTACT_EVT: state = SELF_DOWN_PEER_LEAVING; break; case PEER_LOST_CONTACT_EVT: state = SELF_LEAVING_PEER_DOWN; break; case NODE_SYNCH_BEGIN_EVT: state = NODE_SYNCHING; break; case NODE_FAILOVER_BEGIN_EVT: state = NODE_FAILINGOVER; break; case SELF_ESTABL_CONTACT_EVT: case PEER_ESTABL_CONTACT_EVT: case NODE_SYNCH_END_EVT: case NODE_FAILOVER_END_EVT: break; default: goto illegal_evt; } break; case SELF_DOWN_PEER_LEAVING: switch (evt) { case PEER_LOST_CONTACT_EVT: state = SELF_DOWN_PEER_DOWN; break; case SELF_ESTABL_CONTACT_EVT: case PEER_ESTABL_CONTACT_EVT: case SELF_LOST_CONTACT_EVT: break; case NODE_SYNCH_END_EVT: case NODE_SYNCH_BEGIN_EVT: case NODE_FAILOVER_BEGIN_EVT: case NODE_FAILOVER_END_EVT: default: goto illegal_evt; } break; case SELF_UP_PEER_COMING: switch (evt) { case PEER_ESTABL_CONTACT_EVT: state = SELF_UP_PEER_UP; break; case SELF_LOST_CONTACT_EVT: state = SELF_DOWN_PEER_DOWN; break; case SELF_ESTABL_CONTACT_EVT: case PEER_LOST_CONTACT_EVT: case NODE_SYNCH_END_EVT: case NODE_FAILOVER_BEGIN_EVT: break; case NODE_SYNCH_BEGIN_EVT: case NODE_FAILOVER_END_EVT: default: goto illegal_evt; } break; case SELF_COMING_PEER_UP: switch (evt) { case SELF_ESTABL_CONTACT_EVT: state = SELF_UP_PEER_UP; break; case PEER_LOST_CONTACT_EVT: state = SELF_DOWN_PEER_DOWN; break; case SELF_LOST_CONTACT_EVT: case PEER_ESTABL_CONTACT_EVT: break; case NODE_SYNCH_END_EVT: case NODE_SYNCH_BEGIN_EVT: case NODE_FAILOVER_BEGIN_EVT: case NODE_FAILOVER_END_EVT: default: goto illegal_evt; } break; case SELF_LEAVING_PEER_DOWN: switch (evt) { case SELF_LOST_CONTACT_EVT: state = SELF_DOWN_PEER_DOWN; break; case SELF_ESTABL_CONTACT_EVT: case PEER_ESTABL_CONTACT_EVT: case PEER_LOST_CONTACT_EVT: break; case NODE_SYNCH_END_EVT: case NODE_SYNCH_BEGIN_EVT: case NODE_FAILOVER_BEGIN_EVT: case NODE_FAILOVER_END_EVT: default: goto illegal_evt; } break; case NODE_FAILINGOVER: switch (evt) { case SELF_LOST_CONTACT_EVT: state = SELF_DOWN_PEER_LEAVING; break; case PEER_LOST_CONTACT_EVT: state = SELF_LEAVING_PEER_DOWN; break; case NODE_FAILOVER_END_EVT: state = SELF_UP_PEER_UP; break; case NODE_FAILOVER_BEGIN_EVT: case SELF_ESTABL_CONTACT_EVT: case PEER_ESTABL_CONTACT_EVT: break; case NODE_SYNCH_BEGIN_EVT: case NODE_SYNCH_END_EVT: default: goto illegal_evt; } break; case NODE_SYNCHING: switch (evt) { case SELF_LOST_CONTACT_EVT: state = SELF_DOWN_PEER_LEAVING; break; case PEER_LOST_CONTACT_EVT: state = SELF_LEAVING_PEER_DOWN; break; case NODE_SYNCH_END_EVT: state = SELF_UP_PEER_UP; break; case NODE_FAILOVER_BEGIN_EVT: state = NODE_FAILINGOVER; break; case NODE_SYNCH_BEGIN_EVT: case SELF_ESTABL_CONTACT_EVT: case PEER_ESTABL_CONTACT_EVT: break; case NODE_FAILOVER_END_EVT: default: goto illegal_evt; } break; default: pr_err("Unknown node fsm state %x\n", state); break; } trace_tipc_node_fsm(n->peer_id, n->state, state, evt); n->state = state; return; illegal_evt: pr_err("Illegal node fsm evt %x in state %x\n", evt, state); trace_tipc_node_fsm(n->peer_id, n->state, state, evt); } static void node_lost_contact(struct tipc_node *n, struct sk_buff_head *inputq) { struct tipc_sock_conn *conn, *safe; struct tipc_link *l; struct list_head *conns = &n->conn_sks; struct sk_buff *skb; uint i; pr_debug("Lost contact with %x\n", n->addr); n->delete_at = jiffies + msecs_to_jiffies(NODE_CLEANUP_AFTER); trace_tipc_node_lost_contact(n, true, " "); /* Clean up broadcast state */ tipc_bcast_remove_peer(n->net, n->bc_entry.link); skb_queue_purge(&n->bc_entry.namedq); /* Abort any ongoing link failover */ for (i = 0; i < MAX_BEARERS; i++) { l = n->links[i].link; if (l) tipc_link_fsm_evt(l, LINK_FAILOVER_END_EVT); } /* Notify publications from this node */ n->action_flags |= TIPC_NOTIFY_NODE_DOWN; n->peer_net = NULL; n->peer_hash_mix = 0; /* Notify sockets connected to node */ list_for_each_entry_safe(conn, safe, conns, list) { skb = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE, TIPC_CONN_MSG, SHORT_H_SIZE, 0, tipc_own_addr(n->net), conn->peer_node, conn->port, conn->peer_port, TIPC_ERR_NO_NODE); if (likely(skb)) skb_queue_tail(inputq, skb); list_del(&conn->list); kfree(conn); } } /** * tipc_node_get_linkname - get the name of a link * * @net: the applicable net namespace * @bearer_id: id of the bearer * @addr: peer node address * @linkname: link name output buffer * @len: size of @linkname output buffer * * Return: 0 on success */ int tipc_node_get_linkname(struct net *net, u32 bearer_id, u32 addr, char *linkname, size_t len) { struct tipc_link *link; int err = -EINVAL; struct tipc_node *node = tipc_node_find(net, addr); if (!node) return err; if (bearer_id >= MAX_BEARERS) goto exit; tipc_node_read_lock(node); link = node->links[bearer_id].link; if (link) { strncpy(linkname, tipc_link_name(link), len); err = 0; } tipc_node_read_unlock(node); exit: tipc_node_put(node); return err; } /* Caller should hold node lock for the passed node */ static int __tipc_nl_add_node(struct tipc_nl_msg *msg, struct tipc_node *node) { void *hdr; struct nlattr *attrs; hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, NLM_F_MULTI, TIPC_NL_NODE_GET); if (!hdr) return -EMSGSIZE; attrs = nla_nest_start_noflag(msg->skb, TIPC_NLA_NODE); if (!attrs) goto msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_NODE_ADDR, node->addr)) goto attr_msg_full; if (node_is_up(node)) if (nla_put_flag(msg->skb, TIPC_NLA_NODE_UP)) goto attr_msg_full; nla_nest_end(msg->skb, attrs); genlmsg_end(msg->skb, hdr); return 0; attr_msg_full: nla_nest_cancel(msg->skb, attrs); msg_full: genlmsg_cancel(msg->skb, hdr); return -EMSGSIZE; } static void tipc_lxc_xmit(struct net *peer_net, struct sk_buff_head *list) { struct tipc_msg *hdr = buf_msg(skb_peek(list)); struct sk_buff_head inputq; switch (msg_user(hdr)) { case TIPC_LOW_IMPORTANCE: case TIPC_MEDIUM_IMPORTANCE: case TIPC_HIGH_IMPORTANCE: case TIPC_CRITICAL_IMPORTANCE: if (msg_connected(hdr) || msg_named(hdr) || msg_direct(hdr)) { tipc_loopback_trace(peer_net, list); spin_lock_init(&list->lock); tipc_sk_rcv(peer_net, list); return; } if (msg_mcast(hdr)) { tipc_loopback_trace(peer_net, list); skb_queue_head_init(&inputq); tipc_sk_mcast_rcv(peer_net, list, &inputq); __skb_queue_purge(list); skb_queue_purge(&inputq); return; } return; case MSG_FRAGMENTER: if (tipc_msg_assemble(list)) { tipc_loopback_trace(peer_net, list); skb_queue_head_init(&inputq); tipc_sk_mcast_rcv(peer_net, list, &inputq); __skb_queue_purge(list); skb_queue_purge(&inputq); } return; case GROUP_PROTOCOL: case CONN_MANAGER: tipc_loopback_trace(peer_net, list); spin_lock_init(&list->lock); tipc_sk_rcv(peer_net, list); return; case LINK_PROTOCOL: case NAME_DISTRIBUTOR: case TUNNEL_PROTOCOL: case BCAST_PROTOCOL: return; default: return; } } /** * tipc_node_xmit() - general link level function for message sending * @net: the applicable net namespace * @list: chain of buffers containing message * @dnode: address of destination node * @selector: a number used for deterministic link selection * Consumes the buffer chain. * Return: 0 if success, otherwise: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE,-ENOBUF */ int tipc_node_xmit(struct net *net, struct sk_buff_head *list, u32 dnode, int selector) { struct tipc_link_entry *le = NULL; struct tipc_node *n; struct sk_buff_head xmitq; bool node_up = false; struct net *peer_net; int bearer_id; int rc; if (in_own_node(net, dnode)) { tipc_loopback_trace(net, list); spin_lock_init(&list->lock); tipc_sk_rcv(net, list); return 0; } n = tipc_node_find(net, dnode); if (unlikely(!n)) { __skb_queue_purge(list); return -EHOSTUNREACH; } rcu_read_lock(); tipc_node_read_lock(n); node_up = node_is_up(n); peer_net = n->peer_net; tipc_node_read_unlock(n); if (node_up && peer_net && check_net(peer_net)) { /* xmit inner linux container */ tipc_lxc_xmit(peer_net, list); if (likely(skb_queue_empty(list))) { rcu_read_unlock(); tipc_node_put(n); return 0; } } rcu_read_unlock(); tipc_node_read_lock(n); bearer_id = n->active_links[selector & 1]; if (unlikely(bearer_id == INVALID_BEARER_ID)) { tipc_node_read_unlock(n); tipc_node_put(n); __skb_queue_purge(list); return -EHOSTUNREACH; } __skb_queue_head_init(&xmitq); le = &n->links[bearer_id]; spin_lock_bh(&le->lock); rc = tipc_link_xmit(le->link, list, &xmitq); spin_unlock_bh(&le->lock); tipc_node_read_unlock(n); if (unlikely(rc == -ENOBUFS)) tipc_node_link_down(n, bearer_id, false); else tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr, n); tipc_node_put(n); return rc; } /* tipc_node_xmit_skb(): send single buffer to destination * Buffers sent via this function are generally TIPC_SYSTEM_IMPORTANCE * messages, which will not be rejected * The only exception is datagram messages rerouted after secondary * lookup, which are rare and safe to dispose of anyway. */ int tipc_node_xmit_skb(struct net *net, struct sk_buff *skb, u32 dnode, u32 selector) { struct sk_buff_head head; __skb_queue_head_init(&head); __skb_queue_tail(&head, skb); tipc_node_xmit(net, &head, dnode, selector); return 0; } /* tipc_node_distr_xmit(): send single buffer msgs to individual destinations * Note: this is only for SYSTEM_IMPORTANCE messages, which cannot be rejected */ int tipc_node_distr_xmit(struct net *net, struct sk_buff_head *xmitq) { struct sk_buff *skb; u32 selector, dnode; while ((skb = __skb_dequeue(xmitq))) { selector = msg_origport(buf_msg(skb)); dnode = msg_destnode(buf_msg(skb)); tipc_node_xmit_skb(net, skb, dnode, selector); } return 0; } void tipc_node_broadcast(struct net *net, struct sk_buff *skb, int rc_dests) { struct sk_buff_head xmitq; struct sk_buff *txskb; struct tipc_node *n; u16 dummy; u32 dst; /* Use broadcast if all nodes support it */ if (!rc_dests && tipc_bcast_get_mode(net) != BCLINK_MODE_RCAST) { __skb_queue_head_init(&xmitq); __skb_queue_tail(&xmitq, skb); tipc_bcast_xmit(net, &xmitq, &dummy); return; } /* Otherwise use legacy replicast method */ rcu_read_lock(); list_for_each_entry_rcu(n, tipc_nodes(net), list) { dst = n->addr; if (in_own_node(net, dst)) continue; if (!node_is_up(n)) continue; txskb = pskb_copy(skb, GFP_ATOMIC); if (!txskb) break; msg_set_destnode(buf_msg(txskb), dst); tipc_node_xmit_skb(net, txskb, dst, 0); } rcu_read_unlock(); kfree_skb(skb); } static void tipc_node_mcast_rcv(struct tipc_node *n) { struct tipc_bclink_entry *be = &n->bc_entry; /* 'arrvq' is under inputq2's lock protection */ spin_lock_bh(&be->inputq2.lock); spin_lock_bh(&be->inputq1.lock); skb_queue_splice_tail_init(&be->inputq1, &be->arrvq); spin_unlock_bh(&be->inputq1.lock); spin_unlock_bh(&be->inputq2.lock); tipc_sk_mcast_rcv(n->net, &be->arrvq, &be->inputq2); } static void tipc_node_bc_sync_rcv(struct tipc_node *n, struct tipc_msg *hdr, int bearer_id, struct sk_buff_head *xmitq) { struct tipc_link *ucl; int rc; rc = tipc_bcast_sync_rcv(n->net, n->bc_entry.link, hdr, xmitq); if (rc & TIPC_LINK_DOWN_EVT) { tipc_node_reset_links(n); return; } if (!(rc & TIPC_LINK_SND_STATE)) return; /* If probe message, a STATE response will be sent anyway */ if (msg_probe(hdr)) return; /* Produce a STATE message carrying broadcast NACK */ tipc_node_read_lock(n); ucl = n->links[bearer_id].link; if (ucl) tipc_link_build_state_msg(ucl, xmitq); tipc_node_read_unlock(n); } /** * tipc_node_bc_rcv - process TIPC broadcast packet arriving from off-node * @net: the applicable net namespace * @skb: TIPC packet * @bearer_id: id of bearer message arrived on * * Invoked with no locks held. */ static void tipc_node_bc_rcv(struct net *net, struct sk_buff *skb, int bearer_id) { int rc; struct sk_buff_head xmitq; struct tipc_bclink_entry *be; struct tipc_link_entry *le; struct tipc_msg *hdr = buf_msg(skb); int usr = msg_user(hdr); u32 dnode = msg_destnode(hdr); struct tipc_node *n; __skb_queue_head_init(&xmitq); /* If NACK for other node, let rcv link for that node peek into it */ if ((usr == BCAST_PROTOCOL) && (dnode != tipc_own_addr(net))) n = tipc_node_find(net, dnode); else n = tipc_node_find(net, msg_prevnode(hdr)); if (!n) { kfree_skb(skb); return; } be = &n->bc_entry; le = &n->links[bearer_id]; rc = tipc_bcast_rcv(net, be->link, skb); /* Broadcast ACKs are sent on a unicast link */ if (rc & TIPC_LINK_SND_STATE) { tipc_node_read_lock(n); tipc_link_build_state_msg(le->link, &xmitq); tipc_node_read_unlock(n); } if (!skb_queue_empty(&xmitq)) tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr, n); if (!skb_queue_empty(&be->inputq1)) tipc_node_mcast_rcv(n); /* Handle NAME_DISTRIBUTOR messages sent from 1.7 nodes */ if (!skb_queue_empty(&n->bc_entry.namedq)) tipc_named_rcv(net, &n->bc_entry.namedq, &n->bc_entry.named_rcv_nxt, &n->bc_entry.named_open); /* If reassembly or retransmission failure => reset all links to peer */ if (rc & TIPC_LINK_DOWN_EVT) tipc_node_reset_links(n); tipc_node_put(n); } /** * tipc_node_check_state - check and if necessary update node state * @n: target tipc_node * @skb: TIPC packet * @bearer_id: identity of bearer delivering the packet * @xmitq: queue for messages to be xmited on * Return: true if state and msg are ok, otherwise false */ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb, int bearer_id, struct sk_buff_head *xmitq) { struct tipc_msg *hdr = buf_msg(skb); int usr = msg_user(hdr); int mtyp = msg_type(hdr); u16 oseqno = msg_seqno(hdr); u16 exp_pkts = msg_msgcnt(hdr); u16 rcv_nxt, syncpt, dlv_nxt, inputq_len; int state = n->state; struct tipc_link *l, *tnl, *pl = NULL; struct tipc_media_addr *maddr; int pb_id; if (trace_tipc_node_check_state_enabled()) { trace_tipc_skb_dump(skb, false, "skb for node state check"); trace_tipc_node_check_state(n, true, " "); } l = n->links[bearer_id].link; if (!l) return false; rcv_nxt = tipc_link_rcv_nxt(l); if (likely((state == SELF_UP_PEER_UP) && (usr != TUNNEL_PROTOCOL))) return true; /* Find parallel link, if any */ for (pb_id = 0; pb_id < MAX_BEARERS; pb_id++) { if ((pb_id != bearer_id) && n->links[pb_id].link) { pl = n->links[pb_id].link; break; } } if (!tipc_link_validate_msg(l, hdr)) { trace_tipc_skb_dump(skb, false, "PROTO invalid (2)!"); trace_tipc_link_dump(l, TIPC_DUMP_NONE, "PROTO invalid (2)!"); return false; } /* Check and update node accesibility if applicable */ if (state == SELF_UP_PEER_COMING) { if (!tipc_link_is_up(l)) return true; if (!msg_peer_link_is_up(hdr)) return true; tipc_node_fsm_evt(n, PEER_ESTABL_CONTACT_EVT); } if (state == SELF_DOWN_PEER_LEAVING) { if (msg_peer_node_is_up(hdr)) return false; tipc_node_fsm_evt(n, PEER_LOST_CONTACT_EVT); return true; } if (state == SELF_LEAVING_PEER_DOWN) return false; /* Ignore duplicate packets */ if ((usr != LINK_PROTOCOL) && less(oseqno, rcv_nxt)) return true; /* Initiate or update failover mode if applicable */ if ((usr == TUNNEL_PROTOCOL) && (mtyp == FAILOVER_MSG)) { syncpt = oseqno + exp_pkts - 1; if (pl && !tipc_link_is_reset(pl)) { __tipc_node_link_down(n, &pb_id, xmitq, &maddr); trace_tipc_node_link_down(n, true, "node link down <- failover!"); tipc_skb_queue_splice_tail_init(tipc_link_inputq(pl), tipc_link_inputq(l)); } /* If parallel link was already down, and this happened before * the tunnel link came up, node failover was never started. * Ensure that a FAILOVER_MSG is sent to get peer out of * NODE_FAILINGOVER state, also this node must accept * TUNNEL_MSGs from peer. */ if (n->state != NODE_FAILINGOVER) tipc_node_link_failover(n, pl, l, xmitq); /* If pkts arrive out of order, use lowest calculated syncpt */ if (less(syncpt, n->sync_point)) n->sync_point = syncpt; } /* Open parallel link when tunnel link reaches synch point */ if ((n->state == NODE_FAILINGOVER) && tipc_link_is_up(l)) { if (!more(rcv_nxt, n->sync_point)) return true; tipc_node_fsm_evt(n, NODE_FAILOVER_END_EVT); if (pl) tipc_link_fsm_evt(pl, LINK_FAILOVER_END_EVT); return true; } /* No syncing needed if only one link */ if (!pl || !tipc_link_is_up(pl)) return true; /* Initiate synch mode if applicable */ if ((usr == TUNNEL_PROTOCOL) && (mtyp == SYNCH_MSG) && (oseqno == 1)) { if (n->capabilities & TIPC_TUNNEL_ENHANCED) syncpt = msg_syncpt(hdr); else syncpt = msg_seqno(msg_inner_hdr(hdr)) + exp_pkts - 1; if (!tipc_link_is_up(l)) __tipc_node_link_up(n, bearer_id, xmitq); if (n->state == SELF_UP_PEER_UP) { n->sync_point = syncpt; tipc_link_fsm_evt(l, LINK_SYNCH_BEGIN_EVT); tipc_node_fsm_evt(n, NODE_SYNCH_BEGIN_EVT); } } /* Open tunnel link when parallel link reaches synch point */ if (n->state == NODE_SYNCHING) { if (tipc_link_is_synching(l)) { tnl = l; } else { tnl = pl; pl = l; } inputq_len = skb_queue_len(tipc_link_inputq(pl)); dlv_nxt = tipc_link_rcv_nxt(pl) - inputq_len; if (more(dlv_nxt, n->sync_point)) { tipc_link_fsm_evt(tnl, LINK_SYNCH_END_EVT); tipc_node_fsm_evt(n, NODE_SYNCH_END_EVT); return true; } if (l == pl) return true; if ((usr == TUNNEL_PROTOCOL) && (mtyp == SYNCH_MSG)) return true; if (usr == LINK_PROTOCOL) return true; return false; } return true; } /** * tipc_rcv - process TIPC packets/messages arriving from off-node * @net: the applicable net namespace * @skb: TIPC packet * @b: pointer to bearer message arrived on * * Invoked with no locks held. Bearer pointer must point to a valid bearer * structure (i.e. cannot be NULL), but bearer can be inactive. */ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b) { struct sk_buff_head xmitq; struct tipc_link_entry *le; struct tipc_msg *hdr; struct tipc_node *n; int bearer_id = b->identity; u32 self = tipc_own_addr(net); int usr, rc = 0; u16 bc_ack; #ifdef CONFIG_TIPC_CRYPTO struct tipc_ehdr *ehdr; /* Check if message must be decrypted first */ if (TIPC_SKB_CB(skb)->decrypted || !tipc_ehdr_validate(skb)) goto rcv; ehdr = (struct tipc_ehdr *)skb->data; if (likely(ehdr->user != LINK_CONFIG)) { n = tipc_node_find(net, ntohl(ehdr->addr)); if (unlikely(!n)) goto discard; } else { n = tipc_node_find_by_id(net, ehdr->id); } tipc_crypto_rcv(net, (n) ? n->crypto_rx : NULL, &skb, b); if (!skb) return; rcv: #endif /* Ensure message is well-formed before touching the header */ if (unlikely(!tipc_msg_validate(&skb))) goto discard; __skb_queue_head_init(&xmitq); hdr = buf_msg(skb); usr = msg_user(hdr); bc_ack = msg_bcast_ack(hdr); /* Handle arrival of discovery or broadcast packet */ if (unlikely(msg_non_seq(hdr))) { if (unlikely(usr == LINK_CONFIG)) return tipc_disc_rcv(net, skb, b); else return tipc_node_bc_rcv(net, skb, bearer_id); } /* Discard unicast link messages destined for another node */ if (unlikely(!msg_short(hdr) && (msg_destnode(hdr) != self))) goto discard; /* Locate neighboring node that sent packet */ n = tipc_node_find(net, msg_prevnode(hdr)); if (unlikely(!n)) goto discard; le = &n->links[bearer_id]; /* Ensure broadcast reception is in synch with peer's send state */ if (unlikely(usr == LINK_PROTOCOL)) { if (unlikely(skb_linearize(skb))) { tipc_node_put(n); goto discard; } hdr = buf_msg(skb); tipc_node_bc_sync_rcv(n, hdr, bearer_id, &xmitq); } else if (unlikely(tipc_link_acked(n->bc_entry.link) != bc_ack)) { tipc_bcast_ack_rcv(net, n->bc_entry.link, hdr); } /* Receive packet directly if conditions permit */ tipc_node_read_lock(n); if (likely((n->state == SELF_UP_PEER_UP) && (usr != TUNNEL_PROTOCOL))) { spin_lock_bh(&le->lock); if (le->link) { rc = tipc_link_rcv(le->link, skb, &xmitq); skb = NULL; } spin_unlock_bh(&le->lock); } tipc_node_read_unlock(n); /* Check/update node state before receiving */ if (unlikely(skb)) { if (unlikely(skb_linearize(skb))) goto out_node_put; tipc_node_write_lock(n); if (tipc_node_check_state(n, skb, bearer_id, &xmitq)) { if (le->link) { rc = tipc_link_rcv(le->link, skb, &xmitq); skb = NULL; } } tipc_node_write_unlock(n); } if (unlikely(rc & TIPC_LINK_UP_EVT)) tipc_node_link_up(n, bearer_id, &xmitq); if (unlikely(rc & TIPC_LINK_DOWN_EVT)) tipc_node_link_down(n, bearer_id, false); if (unlikely(!skb_queue_empty(&n->bc_entry.namedq))) tipc_named_rcv(net, &n->bc_entry.namedq, &n->bc_entry.named_rcv_nxt, &n->bc_entry.named_open); if (unlikely(!skb_queue_empty(&n->bc_entry.inputq1))) tipc_node_mcast_rcv(n); if (!skb_queue_empty(&le->inputq)) tipc_sk_rcv(net, &le->inputq); if (!skb_queue_empty(&xmitq)) tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr, n); out_node_put: tipc_node_put(n); discard: kfree_skb(skb); } void tipc_node_apply_property(struct net *net, struct tipc_bearer *b, int prop) { struct tipc_net *tn = tipc_net(net); int bearer_id = b->identity; struct sk_buff_head xmitq; struct tipc_link_entry *e; struct tipc_node *n; __skb_queue_head_init(&xmitq); rcu_read_lock(); list_for_each_entry_rcu(n, &tn->node_list, list) { tipc_node_write_lock(n); e = &n->links[bearer_id]; if (e->link) { if (prop == TIPC_NLA_PROP_TOL) tipc_link_set_tolerance(e->link, b->tolerance, &xmitq); else if (prop == TIPC_NLA_PROP_MTU) tipc_link_set_mtu(e->link, b->mtu); /* Update MTU for node link entry */ e->mtu = tipc_link_mss(e->link); } tipc_node_write_unlock(n); tipc_bearer_xmit(net, bearer_id, &xmitq, &e->maddr, NULL); } rcu_read_unlock(); } int tipc_nl_peer_rm(struct sk_buff *skb, struct genl_info *info) { struct net *net = sock_net(skb->sk); struct tipc_net *tn = net_generic(net, tipc_net_id); struct nlattr *attrs[TIPC_NLA_NET_MAX + 1]; struct tipc_node *peer, *temp_node; u8 node_id[NODE_ID_LEN]; u64 *w0 = (u64 *)&node_id[0]; u64 *w1 = (u64 *)&node_id[8]; u32 addr; int err; /* We identify the peer by its net */ if (!info->attrs[TIPC_NLA_NET]) return -EINVAL; err = nla_parse_nested_deprecated(attrs, TIPC_NLA_NET_MAX, info->attrs[TIPC_NLA_NET], tipc_nl_net_policy, info->extack); if (err) return err; /* attrs[TIPC_NLA_NET_NODEID] and attrs[TIPC_NLA_NET_ADDR] are * mutually exclusive cases */ if (attrs[TIPC_NLA_NET_ADDR]) { addr = nla_get_u32(attrs[TIPC_NLA_NET_ADDR]); if (!addr) return -EINVAL; } if (attrs[TIPC_NLA_NET_NODEID]) { if (!attrs[TIPC_NLA_NET_NODEID_W1]) return -EINVAL; *w0 = nla_get_u64(attrs[TIPC_NLA_NET_NODEID]); *w1 = nla_get_u64(attrs[TIPC_NLA_NET_NODEID_W1]); addr = hash128to32(node_id); } if (in_own_node(net, addr)) return -ENOTSUPP; spin_lock_bh(&tn->node_list_lock); peer = tipc_node_find(net, addr); if (!peer) { spin_unlock_bh(&tn->node_list_lock); return -ENXIO; } tipc_node_write_lock(peer); if (peer->state != SELF_DOWN_PEER_DOWN && peer->state != SELF_DOWN_PEER_LEAVING) { tipc_node_write_unlock(peer); err = -EBUSY; goto err_out; } tipc_node_clear_links(peer); tipc_node_write_unlock(peer); tipc_node_delete(peer); /* Calculate cluster capabilities */ tn->capabilities = TIPC_NODE_CAPABILITIES; list_for_each_entry_rcu(temp_node, &tn->node_list, list) { tn->capabilities &= temp_node->capabilities; } tipc_bcast_toggle_rcast(net, (tn->capabilities & TIPC_BCAST_RCAST)); err = 0; err_out: tipc_node_put(peer); spin_unlock_bh(&tn->node_list_lock); return err; } int tipc_nl_node_dump(struct sk_buff *skb, struct netlink_callback *cb) { int err; struct net *net = sock_net(skb->sk); struct tipc_net *tn = net_generic(net, tipc_net_id); int done = cb->args[0]; int last_addr = cb->args[1]; struct tipc_node *node; struct tipc_nl_msg msg; if (done) return 0; msg.skb = skb; msg.portid = NETLINK_CB(cb->skb).portid; msg.seq = cb->nlh->nlmsg_seq; rcu_read_lock(); if (last_addr) { node = tipc_node_find(net, last_addr); if (!node) { rcu_read_unlock(); /* We never set seq or call nl_dump_check_consistent() * this means that setting prev_seq here will cause the * consistence check to fail in the netlink callback * handler. Resulting in the NLMSG_DONE message having * the NLM_F_DUMP_INTR flag set if the node state * changed while we released the lock. */ cb->prev_seq = 1; return -EPIPE; } tipc_node_put(node); } list_for_each_entry_rcu(node, &tn->node_list, list) { if (node->preliminary) continue; if (last_addr) { if (node->addr == last_addr) last_addr = 0; else continue; } tipc_node_read_lock(node); err = __tipc_nl_add_node(&msg, node); if (err) { last_addr = node->addr; tipc_node_read_unlock(node); goto out; } tipc_node_read_unlock(node); } done = 1; out: cb->args[0] = done; cb->args[1] = last_addr; rcu_read_unlock(); return skb->len; } /* tipc_node_find_by_name - locate owner node of link by link's name * @net: the applicable net namespace * @name: pointer to link name string * @bearer_id: pointer to index in 'node->links' array where the link was found. * * Returns pointer to node owning the link, or 0 if no matching link is found. */ static struct tipc_node *tipc_node_find_by_name(struct net *net, const char *link_name, unsigned int *bearer_id) { struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_link *l; struct tipc_node *n; struct tipc_node *found_node = NULL; int i; *bearer_id = 0; rcu_read_lock(); list_for_each_entry_rcu(n, &tn->node_list, list) { tipc_node_read_lock(n); for (i = 0; i < MAX_BEARERS; i++) { l = n->links[i].link; if (l && !strcmp(tipc_link_name(l), link_name)) { *bearer_id = i; found_node = n; break; } } tipc_node_read_unlock(n); if (found_node) break; } rcu_read_unlock(); return found_node; } int tipc_nl_node_set_link(struct sk_buff *skb, struct genl_info *info) { int err; int res = 0; int bearer_id; char *name; struct tipc_link *link; struct tipc_node *node; struct sk_buff_head xmitq; struct nlattr *attrs[TIPC_NLA_LINK_MAX + 1]; struct net *net = sock_net(skb->sk); __skb_queue_head_init(&xmitq); if (!info->attrs[TIPC_NLA_LINK]) return -EINVAL; err = nla_parse_nested_deprecated(attrs, TIPC_NLA_LINK_MAX, info->attrs[TIPC_NLA_LINK], tipc_nl_link_policy, info->extack); if (err) return err; if (!attrs[TIPC_NLA_LINK_NAME]) return -EINVAL; name = nla_data(attrs[TIPC_NLA_LINK_NAME]); if (strcmp(name, tipc_bclink_name) == 0) return tipc_nl_bc_link_set(net, attrs); node = tipc_node_find_by_name(net, name, &bearer_id); if (!node) return -EINVAL; tipc_node_read_lock(node); link = node->links[bearer_id].link; if (!link) { res = -EINVAL; goto out; } if (attrs[TIPC_NLA_LINK_PROP]) { struct nlattr *props[TIPC_NLA_PROP_MAX + 1]; err = tipc_nl_parse_link_prop(attrs[TIPC_NLA_LINK_PROP], props); if (err) { res = err; goto out; } if (props[TIPC_NLA_PROP_TOL]) { u32 tol; tol = nla_get_u32(props[TIPC_NLA_PROP_TOL]); tipc_link_set_tolerance(link, tol, &xmitq); } if (props[TIPC_NLA_PROP_PRIO]) { u32 prio; prio = nla_get_u32(props[TIPC_NLA_PROP_PRIO]); tipc_link_set_prio(link, prio, &xmitq); } if (props[TIPC_NLA_PROP_WIN]) { u32 max_win; max_win = nla_get_u32(props[TIPC_NLA_PROP_WIN]); tipc_link_set_queue_limits(link, tipc_link_min_win(link), max_win); } } out: tipc_node_read_unlock(node); tipc_bearer_xmit(net, bearer_id, &xmitq, &node->links[bearer_id].maddr, NULL); return res; } int tipc_nl_node_get_link(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct nlattr *attrs[TIPC_NLA_LINK_MAX + 1]; struct tipc_nl_msg msg; char *name; int err; msg.portid = info->snd_portid; msg.seq = info->snd_seq; if (!info->attrs[TIPC_NLA_LINK]) return -EINVAL; err = nla_parse_nested_deprecated(attrs, TIPC_NLA_LINK_MAX, info->attrs[TIPC_NLA_LINK], tipc_nl_link_policy, info->extack); if (err) return err; if (!attrs[TIPC_NLA_LINK_NAME]) return -EINVAL; name = nla_data(attrs[TIPC_NLA_LINK_NAME]); msg.skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (!msg.skb) return -ENOMEM; if (strcmp(name, tipc_bclink_name) == 0) { err = tipc_nl_add_bc_link(net, &msg, tipc_net(net)->bcl); if (err) goto err_free; } else { int bearer_id; struct tipc_node *node; struct tipc_link *link; node = tipc_node_find_by_name(net, name, &bearer_id); if (!node) { err = -EINVAL; goto err_free; } tipc_node_read_lock(node); link = node->links[bearer_id].link; if (!link) { tipc_node_read_unlock(node); err = -EINVAL; goto err_free; } err = __tipc_nl_add_link(net, &msg, link, 0); tipc_node_read_unlock(node); if (err) goto err_free; } return genlmsg_reply(msg.skb, info); err_free: nlmsg_free(msg.skb); return err; } int tipc_nl_node_reset_link_stats(struct sk_buff *skb, struct genl_info *info) { int err; char *link_name; unsigned int bearer_id; struct tipc_link *link; struct tipc_node *node; struct nlattr *attrs[TIPC_NLA_LINK_MAX + 1]; struct net *net = sock_net(skb->sk); struct tipc_net *tn = tipc_net(net); struct tipc_link_entry *le; if (!info->attrs[TIPC_NLA_LINK]) return -EINVAL; err = nla_parse_nested_deprecated(attrs, TIPC_NLA_LINK_MAX, info->attrs[TIPC_NLA_LINK], tipc_nl_link_policy, info->extack); if (err) return err; if (!attrs[TIPC_NLA_LINK_NAME]) return -EINVAL; link_name = nla_data(attrs[TIPC_NLA_LINK_NAME]); err = -EINVAL; if (!strcmp(link_name, tipc_bclink_name)) { err = tipc_bclink_reset_stats(net, tipc_bc_sndlink(net)); if (err) return err; return 0; } else if (strstr(link_name, tipc_bclink_name)) { rcu_read_lock(); list_for_each_entry_rcu(node, &tn->node_list, list) { tipc_node_read_lock(node); link = node->bc_entry.link; if (link && !strcmp(link_name, tipc_link_name(link))) { err = tipc_bclink_reset_stats(net, link); tipc_node_read_unlock(node); break; } tipc_node_read_unlock(node); } rcu_read_unlock(); return err; } node = tipc_node_find_by_name(net, link_name, &bearer_id); if (!node) return -EINVAL; le = &node->links[bearer_id]; tipc_node_read_lock(node); spin_lock_bh(&le->lock); link = node->links[bearer_id].link; if (!link) { spin_unlock_bh(&le->lock); tipc_node_read_unlock(node); return -EINVAL; } tipc_link_reset_stats(link); spin_unlock_bh(&le->lock); tipc_node_read_unlock(node); return 0; } /* Caller should hold node lock */ static int __tipc_nl_add_node_links(struct net *net, struct tipc_nl_msg *msg, struct tipc_node *node, u32 *prev_link, bool bc_link) { u32 i; int err; for (i = *prev_link; i < MAX_BEARERS; i++) { *prev_link = i; if (!node->links[i].link) continue; err = __tipc_nl_add_link(net, msg, node->links[i].link, NLM_F_MULTI); if (err) return err; } if (bc_link) { *prev_link = i; err = tipc_nl_add_bc_link(net, msg, node->bc_entry.link); if (err) return err; } *prev_link = 0; return 0; } int tipc_nl_node_dump_link(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct nlattr **attrs = genl_dumpit_info(cb)->info.attrs; struct nlattr *link[TIPC_NLA_LINK_MAX + 1]; struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_node *node; struct tipc_nl_msg msg; u32 prev_node = cb->args[0]; u32 prev_link = cb->args[1]; int done = cb->args[2]; bool bc_link = cb->args[3]; int err; if (done) return 0; if (!prev_node) { /* Check if broadcast-receiver links dumping is needed */ if (attrs && attrs[TIPC_NLA_LINK]) { err = nla_parse_nested_deprecated(link, TIPC_NLA_LINK_MAX, attrs[TIPC_NLA_LINK], tipc_nl_link_policy, NULL); if (unlikely(err)) return err; if (unlikely(!link[TIPC_NLA_LINK_BROADCAST])) return -EINVAL; bc_link = true; } } msg.skb = skb; msg.portid = NETLINK_CB(cb->skb).portid; msg.seq = cb->nlh->nlmsg_seq; rcu_read_lock(); if (prev_node) { node = tipc_node_find(net, prev_node); if (!node) { /* We never set seq or call nl_dump_check_consistent() * this means that setting prev_seq here will cause the * consistence check to fail in the netlink callback * handler. Resulting in the last NLMSG_DONE message * having the NLM_F_DUMP_INTR flag set. */ cb->prev_seq = 1; goto out; } tipc_node_put(node); list_for_each_entry_continue_rcu(node, &tn->node_list, list) { tipc_node_read_lock(node); err = __tipc_nl_add_node_links(net, &msg, node, &prev_link, bc_link); tipc_node_read_unlock(node); if (err) goto out; prev_node = node->addr; } } else { err = tipc_nl_add_bc_link(net, &msg, tn->bcl); if (err) goto out; list_for_each_entry_rcu(node, &tn->node_list, list) { tipc_node_read_lock(node); err = __tipc_nl_add_node_links(net, &msg, node, &prev_link, bc_link); tipc_node_read_unlock(node); if (err) goto out; prev_node = node->addr; } } done = 1; out: rcu_read_unlock(); cb->args[0] = prev_node; cb->args[1] = prev_link; cb->args[2] = done; cb->args[3] = bc_link; return skb->len; } int tipc_nl_node_set_monitor(struct sk_buff *skb, struct genl_info *info) { struct nlattr *attrs[TIPC_NLA_MON_MAX + 1]; struct net *net = sock_net(skb->sk); int err; if (!info->attrs[TIPC_NLA_MON]) return -EINVAL; err = nla_parse_nested_deprecated(attrs, TIPC_NLA_MON_MAX, info->attrs[TIPC_NLA_MON], tipc_nl_monitor_policy, info->extack); if (err) return err; if (attrs[TIPC_NLA_MON_ACTIVATION_THRESHOLD]) { u32 val; val = nla_get_u32(attrs[TIPC_NLA_MON_ACTIVATION_THRESHOLD]); err = tipc_nl_monitor_set_threshold(net, val); if (err) return err; } return 0; } static int __tipc_nl_add_monitor_prop(struct net *net, struct tipc_nl_msg *msg) { struct nlattr *attrs; void *hdr; u32 val; hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, 0, TIPC_NL_MON_GET); if (!hdr) return -EMSGSIZE; attrs = nla_nest_start_noflag(msg->skb, TIPC_NLA_MON); if (!attrs) goto msg_full; val = tipc_nl_monitor_get_threshold(net); if (nla_put_u32(msg->skb, TIPC_NLA_MON_ACTIVATION_THRESHOLD, val)) goto attr_msg_full; nla_nest_end(msg->skb, attrs); genlmsg_end(msg->skb, hdr); return 0; attr_msg_full: nla_nest_cancel(msg->skb, attrs); msg_full: genlmsg_cancel(msg->skb, hdr); return -EMSGSIZE; } int tipc_nl_node_get_monitor(struct sk_buff *skb, struct genl_info *info) { struct net *net = sock_net(skb->sk); struct tipc_nl_msg msg; int err; msg.skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (!msg.skb) return -ENOMEM; msg.portid = info->snd_portid; msg.seq = info->snd_seq; err = __tipc_nl_add_monitor_prop(net, &msg); if (err) { nlmsg_free(msg.skb); return err; } return genlmsg_reply(msg.skb, info); } int tipc_nl_node_dump_monitor(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); u32 prev_bearer = cb->args[0]; struct tipc_nl_msg msg; int bearer_id; int err; if (prev_bearer == MAX_BEARERS) return 0; msg.skb = skb; msg.portid = NETLINK_CB(cb->skb).portid; msg.seq = cb->nlh->nlmsg_seq; rtnl_lock(); for (bearer_id = prev_bearer; bearer_id < MAX_BEARERS; bearer_id++) { err = __tipc_nl_add_monitor(net, &msg, bearer_id); if (err) break; } rtnl_unlock(); cb->args[0] = bearer_id; return skb->len; } int tipc_nl_node_dump_monitor_peer(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); u32 prev_node = cb->args[1]; u32 bearer_id = cb->args[2]; int done = cb->args[0]; struct tipc_nl_msg msg; int err; if (!prev_node) { struct nlattr **attrs = genl_dumpit_info(cb)->info.attrs; struct nlattr *mon[TIPC_NLA_MON_MAX + 1]; if (!attrs[TIPC_NLA_MON]) return -EINVAL; err = nla_parse_nested_deprecated(mon, TIPC_NLA_MON_MAX, attrs[TIPC_NLA_MON], tipc_nl_monitor_policy, NULL); if (err) return err; if (!mon[TIPC_NLA_MON_REF]) return -EINVAL; bearer_id = nla_get_u32(mon[TIPC_NLA_MON_REF]); if (bearer_id >= MAX_BEARERS) return -EINVAL; } if (done) return 0; msg.skb = skb; msg.portid = NETLINK_CB(cb->skb).portid; msg.seq = cb->nlh->nlmsg_seq; rtnl_lock(); err = tipc_nl_add_monitor_peer(net, &msg, bearer_id, &prev_node); if (!err) done = 1; rtnl_unlock(); cb->args[0] = done; cb->args[1] = prev_node; cb->args[2] = bearer_id; return skb->len; } #ifdef CONFIG_TIPC_CRYPTO static int tipc_nl_retrieve_key(struct nlattr **attrs, struct tipc_aead_key **pkey) { struct nlattr *attr = attrs[TIPC_NLA_NODE_KEY]; struct tipc_aead_key *key; if (!attr) return -ENODATA; if (nla_len(attr) < sizeof(*key)) return -EINVAL; key = (struct tipc_aead_key *)nla_data(attr); if (key->keylen > TIPC_AEAD_KEYLEN_MAX || nla_len(attr) < tipc_aead_key_size(key)) return -EINVAL; *pkey = key; return 0; } static int tipc_nl_retrieve_nodeid(struct nlattr **attrs, u8 **node_id) { struct nlattr *attr = attrs[TIPC_NLA_NODE_ID]; if (!attr) return -ENODATA; if (nla_len(attr) < TIPC_NODEID_LEN) return -EINVAL; *node_id = (u8 *)nla_data(attr); return 0; } static int tipc_nl_retrieve_rekeying(struct nlattr **attrs, u32 *intv) { struct nlattr *attr = attrs[TIPC_NLA_NODE_REKEYING]; if (!attr) return -ENODATA; *intv = nla_get_u32(attr); return 0; } static int __tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info) { struct nlattr *attrs[TIPC_NLA_NODE_MAX + 1]; struct net *net = sock_net(skb->sk); struct tipc_crypto *tx = tipc_net(net)->crypto_tx, *c = tx; struct tipc_node *n = NULL; struct tipc_aead_key *ukey; bool rekeying = true, master_key = false; u8 *id, *own_id, mode; u32 intv = 0; int rc = 0; if (!info->attrs[TIPC_NLA_NODE]) return -EINVAL; rc = nla_parse_nested(attrs, TIPC_NLA_NODE_MAX, info->attrs[TIPC_NLA_NODE], tipc_nl_node_policy, info->extack); if (rc) return rc; own_id = tipc_own_id(net); if (!own_id) { GENL_SET_ERR_MSG(info, "not found own node identity (set id?)"); return -EPERM; } rc = tipc_nl_retrieve_rekeying(attrs, &intv); if (rc == -ENODATA) rekeying = false; rc = tipc_nl_retrieve_key(attrs, &ukey); if (rc == -ENODATA && rekeying) goto rekeying; else if (rc) return rc; rc = tipc_aead_key_validate(ukey, info); if (rc) return rc; rc = tipc_nl_retrieve_nodeid(attrs, &id); switch (rc) { case -ENODATA: mode = CLUSTER_KEY; master_key = !!(attrs[TIPC_NLA_NODE_KEY_MASTER]); break; case 0: mode = PER_NODE_KEY; if (memcmp(id, own_id, NODE_ID_LEN)) { n = tipc_node_find_by_id(net, id) ?: tipc_node_create(net, 0, id, 0xffffu, 0, true); if (unlikely(!n)) return -ENOMEM; c = n->crypto_rx; } break; default: return rc; } /* Initiate the TX/RX key */ rc = tipc_crypto_key_init(c, ukey, mode, master_key); if (n) tipc_node_put(n); if (unlikely(rc < 0)) { GENL_SET_ERR_MSG(info, "unable to initiate or attach new key"); return rc; } else if (c == tx) { /* Distribute TX key but not master one */ if (!master_key && tipc_crypto_key_distr(tx, rc, NULL)) GENL_SET_ERR_MSG(info, "failed to replicate new key"); rekeying: /* Schedule TX rekeying if needed */ tipc_crypto_rekeying_sched(tx, rekeying, intv); } return 0; } int tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info) { int err; rtnl_lock(); err = __tipc_nl_node_set_key(skb, info); rtnl_unlock(); return err; } static int __tipc_nl_node_flush_key(struct sk_buff *skb, struct genl_info *info) { struct net *net = sock_net(skb->sk); struct tipc_net *tn = tipc_net(net); struct tipc_node *n; tipc_crypto_key_flush(tn->crypto_tx); rcu_read_lock(); list_for_each_entry_rcu(n, &tn->node_list, list) tipc_crypto_key_flush(n->crypto_rx); rcu_read_unlock(); return 0; } int tipc_nl_node_flush_key(struct sk_buff *skb, struct genl_info *info) { int err; rtnl_lock(); err = __tipc_nl_node_flush_key(skb, info); rtnl_unlock(); return err; } #endif /** * tipc_node_dump - dump TIPC node data * @n: tipc node to be dumped * @more: dump more? * - false: dump only tipc node data * - true: dump node link data as well * @buf: returned buffer of dump data in format */ int tipc_node_dump(struct tipc_node *n, bool more, char *buf) { int i = 0; size_t sz = (more) ? NODE_LMAX : NODE_LMIN; if (!n) { i += scnprintf(buf, sz, "node data: (null)\n"); return i; } i += scnprintf(buf, sz, "node data: %x", n->addr); i += scnprintf(buf + i, sz - i, " %x", n->state); i += scnprintf(buf + i, sz - i, " %d", n->active_links[0]); i += scnprintf(buf + i, sz - i, " %d", n->active_links[1]); i += scnprintf(buf + i, sz - i, " %x", n->action_flags); i += scnprintf(buf + i, sz - i, " %u", n->failover_sent); i += scnprintf(buf + i, sz - i, " %u", n->sync_point); i += scnprintf(buf + i, sz - i, " %d", n->link_cnt); i += scnprintf(buf + i, sz - i, " %u", n->working_links); i += scnprintf(buf + i, sz - i, " %x", n->capabilities); i += scnprintf(buf + i, sz - i, " %lu\n", n->keepalive_intv); if (!more) return i; i += scnprintf(buf + i, sz - i, "link_entry[0]:\n"); i += scnprintf(buf + i, sz - i, " mtu: %u\n", n->links[0].mtu); i += scnprintf(buf + i, sz - i, " media: "); i += tipc_media_addr_printf(buf + i, sz - i, &n->links[0].maddr); i += scnprintf(buf + i, sz - i, "\n"); i += tipc_link_dump(n->links[0].link, TIPC_DUMP_NONE, buf + i); i += scnprintf(buf + i, sz - i, " inputq: "); i += tipc_list_dump(&n->links[0].inputq, false, buf + i); i += scnprintf(buf + i, sz - i, "link_entry[1]:\n"); i += scnprintf(buf + i, sz - i, " mtu: %u\n", n->links[1].mtu); i += scnprintf(buf + i, sz - i, " media: "); i += tipc_media_addr_printf(buf + i, sz - i, &n->links[1].maddr); i += scnprintf(buf + i, sz - i, "\n"); i += tipc_link_dump(n->links[1].link, TIPC_DUMP_NONE, buf + i); i += scnprintf(buf + i, sz - i, " inputq: "); i += tipc_list_dump(&n->links[1].inputq, false, buf + i); i += scnprintf(buf + i, sz - i, "bclink:\n "); i += tipc_link_dump(n->bc_entry.link, TIPC_DUMP_NONE, buf + i); return i; } void tipc_node_pre_cleanup_net(struct net *exit_net) { struct tipc_node *n; struct tipc_net *tn; struct net *tmp; rcu_read_lock(); for_each_net_rcu(tmp) { if (tmp == exit_net) continue; tn = tipc_net(tmp); if (!tn) continue; spin_lock_bh(&tn->node_list_lock); list_for_each_entry_rcu(n, &tn->node_list, list) { if (!n->peer_net) continue; if (n->peer_net != exit_net) continue; tipc_node_write_lock(n); n->peer_net = NULL; n->peer_hash_mix = 0; tipc_node_write_unlock_fast(n); break; } spin_unlock_bh(&tn->node_list_lock); } rcu_read_unlock(); }
1 1 1 1 1 1 3 1 1 1 1 1 3 1 3 1 1 1 1 1 1 1 1 1 1 1 2 3 3 1 1 2 1 3 3 3 3 3 1 2 1 1 1 1 1 4 4 1 3 1 3 1 3 4 1 3 1 3 3 3 2 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 // SPDX-License-Identifier: GPL-2.0-or-later /* * uvc_driver.c -- USB Video Class driver * * Copyright (C) 2005-2010 * Laurent Pinchart (laurent.pinchart@ideasonboard.com) */ #include <linux/atomic.h> #include <linux/bits.h> #include <linux/gpio/consumer.h> #include <linux/kernel.h> #include <linux/list.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/usb.h> #include <linux/usb/uvc.h> #include <linux/videodev2.h> #include <linux/vmalloc.h> #include <linux/wait.h> #include <asm/unaligned.h> #include <media/v4l2-common.h> #include <media/v4l2-ioctl.h> #include "uvcvideo.h" #define DRIVER_AUTHOR "Laurent Pinchart " \ "<laurent.pinchart@ideasonboard.com>" #define DRIVER_DESC "USB Video Class driver" unsigned int uvc_clock_param = CLOCK_MONOTONIC; unsigned int uvc_hw_timestamps_param; unsigned int uvc_no_drop_param; static unsigned int uvc_quirks_param = -1; unsigned int uvc_dbg_param; unsigned int uvc_timeout_param = UVC_CTRL_STREAMING_TIMEOUT; /* ------------------------------------------------------------------------ * Utility functions */ struct usb_host_endpoint *uvc_find_endpoint(struct usb_host_interface *alts, u8 epaddr) { struct usb_host_endpoint *ep; unsigned int i; for (i = 0; i < alts->desc.bNumEndpoints; ++i) { ep = &alts->endpoint[i]; if (ep->desc.bEndpointAddress == epaddr) return ep; } return NULL; } static enum v4l2_colorspace uvc_colorspace(const u8 primaries) { static const enum v4l2_colorspace colorprimaries[] = { V4L2_COLORSPACE_SRGB, /* Unspecified */ V4L2_COLORSPACE_SRGB, V4L2_COLORSPACE_470_SYSTEM_M, V4L2_COLORSPACE_470_SYSTEM_BG, V4L2_COLORSPACE_SMPTE170M, V4L2_COLORSPACE_SMPTE240M, }; if (primaries < ARRAY_SIZE(colorprimaries)) return colorprimaries[primaries]; return V4L2_COLORSPACE_SRGB; /* Reserved */ } static enum v4l2_xfer_func uvc_xfer_func(const u8 transfer_characteristics) { /* * V4L2 does not currently have definitions for all possible values of * UVC transfer characteristics. If v4l2_xfer_func is extended with new * values, the mapping below should be updated. * * Substitutions are taken from the mapping given for * V4L2_XFER_FUNC_DEFAULT documented in videodev2.h. */ static const enum v4l2_xfer_func xfer_funcs[] = { V4L2_XFER_FUNC_DEFAULT, /* Unspecified */ V4L2_XFER_FUNC_709, V4L2_XFER_FUNC_709, /* Substitution for BT.470-2 M */ V4L2_XFER_FUNC_709, /* Substitution for BT.470-2 B, G */ V4L2_XFER_FUNC_709, /* Substitution for SMPTE 170M */ V4L2_XFER_FUNC_SMPTE240M, V4L2_XFER_FUNC_NONE, V4L2_XFER_FUNC_SRGB, }; if (transfer_characteristics < ARRAY_SIZE(xfer_funcs)) return xfer_funcs[transfer_characteristics]; return V4L2_XFER_FUNC_DEFAULT; /* Reserved */ } static enum v4l2_ycbcr_encoding uvc_ycbcr_enc(const u8 matrix_coefficients) { /* * V4L2 does not currently have definitions for all possible values of * UVC matrix coefficients. If v4l2_ycbcr_encoding is extended with new * values, the mapping below should be updated. * * Substitutions are taken from the mapping given for * V4L2_YCBCR_ENC_DEFAULT documented in videodev2.h. * * FCC is assumed to be close enough to 601. */ static const enum v4l2_ycbcr_encoding ycbcr_encs[] = { V4L2_YCBCR_ENC_DEFAULT, /* Unspecified */ V4L2_YCBCR_ENC_709, V4L2_YCBCR_ENC_601, /* Substitution for FCC */ V4L2_YCBCR_ENC_601, /* Substitution for BT.470-2 B, G */ V4L2_YCBCR_ENC_601, V4L2_YCBCR_ENC_SMPTE240M, }; if (matrix_coefficients < ARRAY_SIZE(ycbcr_encs)) return ycbcr_encs[matrix_coefficients]; return V4L2_YCBCR_ENC_DEFAULT; /* Reserved */ } /* ------------------------------------------------------------------------ * Terminal and unit management */ struct uvc_entity *uvc_entity_by_id(struct uvc_device *dev, int id) { struct uvc_entity *entity; list_for_each_entry(entity, &dev->entities, list) { if (entity->id == id) return entity; } return NULL; } static struct uvc_entity *uvc_entity_by_reference(struct uvc_device *dev, int id, struct uvc_entity *entity) { unsigned int i; if (entity == NULL) entity = list_entry(&dev->entities, struct uvc_entity, list); list_for_each_entry_continue(entity, &dev->entities, list) { for (i = 0; i < entity->bNrInPins; ++i) if (entity->baSourceID[i] == id) return entity; } return NULL; } static struct uvc_streaming *uvc_stream_by_id(struct uvc_device *dev, int id) { struct uvc_streaming *stream; list_for_each_entry(stream, &dev->streams, list) { if (stream->header.bTerminalLink == id) return stream; } return NULL; } /* ------------------------------------------------------------------------ * Streaming Object Management */ static void uvc_stream_delete(struct uvc_streaming *stream) { if (stream->async_wq) destroy_workqueue(stream->async_wq); mutex_destroy(&stream->mutex); usb_put_intf(stream->intf); kfree(stream->formats); kfree(stream->header.bmaControls); kfree(stream); } static struct uvc_streaming *uvc_stream_new(struct uvc_device *dev, struct usb_interface *intf) { struct uvc_streaming *stream; stream = kzalloc(sizeof(*stream), GFP_KERNEL); if (stream == NULL) return NULL; mutex_init(&stream->mutex); stream->dev = dev; stream->intf = usb_get_intf(intf); stream->intfnum = intf->cur_altsetting->desc.bInterfaceNumber; /* Allocate a stream specific work queue for asynchronous tasks. */ stream->async_wq = alloc_workqueue("uvcvideo", WQ_UNBOUND | WQ_HIGHPRI, 0); if (!stream->async_wq) { uvc_stream_delete(stream); return NULL; } return stream; } /* ------------------------------------------------------------------------ * Descriptors parsing */ static int uvc_parse_format(struct uvc_device *dev, struct uvc_streaming *streaming, struct uvc_format *format, struct uvc_frame *frames, u32 **intervals, const unsigned char *buffer, int buflen) { struct usb_interface *intf = streaming->intf; struct usb_host_interface *alts = intf->cur_altsetting; const struct uvc_format_desc *fmtdesc; struct uvc_frame *frame; const unsigned char *start = buffer; unsigned int width_multiplier = 1; unsigned int interval; unsigned int i, n; u8 ftype; format->type = buffer[2]; format->index = buffer[3]; format->frames = frames; switch (buffer[2]) { case UVC_VS_FORMAT_UNCOMPRESSED: case UVC_VS_FORMAT_FRAME_BASED: n = buffer[2] == UVC_VS_FORMAT_UNCOMPRESSED ? 27 : 28; if (buflen < n) { uvc_dbg(dev, DESCR, "device %d videostreaming interface %d FORMAT error\n", dev->udev->devnum, alts->desc.bInterfaceNumber); return -EINVAL; } /* Find the format descriptor from its GUID. */ fmtdesc = uvc_format_by_guid(&buffer[5]); if (!fmtdesc) { /* * Unknown video formats are not fatal errors, the * caller will skip this descriptor. */ dev_info(&streaming->intf->dev, "Unknown video format %pUl\n", &buffer[5]); return 0; } format->fcc = fmtdesc->fcc; format->bpp = buffer[21]; /* * Some devices report a format that doesn't match what they * really send. */ if (dev->quirks & UVC_QUIRK_FORCE_Y8) { if (format->fcc == V4L2_PIX_FMT_YUYV) { format->fcc = V4L2_PIX_FMT_GREY; format->bpp = 8; width_multiplier = 2; } } /* Some devices report bpp that doesn't match the format. */ if (dev->quirks & UVC_QUIRK_FORCE_BPP) { const struct v4l2_format_info *info = v4l2_format_info(format->fcc); if (info) { unsigned int div = info->hdiv * info->vdiv; n = info->bpp[0] * div; for (i = 1; i < info->comp_planes; i++) n += info->bpp[i]; format->bpp = DIV_ROUND_UP(8 * n, div); } } if (buffer[2] == UVC_VS_FORMAT_UNCOMPRESSED) { ftype = UVC_VS_FRAME_UNCOMPRESSED; } else { ftype = UVC_VS_FRAME_FRAME_BASED; if (buffer[27]) format->flags = UVC_FMT_FLAG_COMPRESSED; } break; case UVC_VS_FORMAT_MJPEG: if (buflen < 11) { uvc_dbg(dev, DESCR, "device %d videostreaming interface %d FORMAT error\n", dev->udev->devnum, alts->desc.bInterfaceNumber); return -EINVAL; } format->fcc = V4L2_PIX_FMT_MJPEG; format->flags = UVC_FMT_FLAG_COMPRESSED; format->bpp = 0; ftype = UVC_VS_FRAME_MJPEG; break; case UVC_VS_FORMAT_DV: if (buflen < 9) { uvc_dbg(dev, DESCR, "device %d videostreaming interface %d FORMAT error\n", dev->udev->devnum, alts->desc.bInterfaceNumber); return -EINVAL; } if ((buffer[8] & 0x7f) > 2) { uvc_dbg(dev, DESCR, "device %d videostreaming interface %d: unknown DV format %u\n", dev->udev->devnum, alts->desc.bInterfaceNumber, buffer[8]); return -EINVAL; } format->fcc = V4L2_PIX_FMT_DV; format->flags = UVC_FMT_FLAG_COMPRESSED | UVC_FMT_FLAG_STREAM; format->bpp = 0; ftype = 0; /* Create a dummy frame descriptor. */ frame = &frames[0]; memset(frame, 0, sizeof(*frame)); frame->bFrameIntervalType = 1; frame->dwDefaultFrameInterval = 1; frame->dwFrameInterval = *intervals; *(*intervals)++ = 1; format->nframes = 1; break; case UVC_VS_FORMAT_MPEG2TS: case UVC_VS_FORMAT_STREAM_BASED: /* Not supported yet. */ default: uvc_dbg(dev, DESCR, "device %d videostreaming interface %d unsupported format %u\n", dev->udev->devnum, alts->desc.bInterfaceNumber, buffer[2]); return -EINVAL; } uvc_dbg(dev, DESCR, "Found format %p4cc", &format->fcc); buflen -= buffer[0]; buffer += buffer[0]; /* * Parse the frame descriptors. Only uncompressed, MJPEG and frame * based formats have frame descriptors. */ while (buflen > 2 && buffer[1] == USB_DT_CS_INTERFACE && buffer[2] == ftype) { unsigned int maxIntervalIndex; frame = &frames[format->nframes]; if (ftype != UVC_VS_FRAME_FRAME_BASED) n = buflen > 25 ? buffer[25] : 0; else n = buflen > 21 ? buffer[21] : 0; n = n ? n : 3; if (buflen < 26 + 4*n) { uvc_dbg(dev, DESCR, "device %d videostreaming interface %d FRAME error\n", dev->udev->devnum, alts->desc.bInterfaceNumber); return -EINVAL; } frame->bFrameIndex = buffer[3]; frame->bmCapabilities = buffer[4]; frame->wWidth = get_unaligned_le16(&buffer[5]) * width_multiplier; frame->wHeight = get_unaligned_le16(&buffer[7]); frame->dwMinBitRate = get_unaligned_le32(&buffer[9]); frame->dwMaxBitRate = get_unaligned_le32(&buffer[13]); if (ftype != UVC_VS_FRAME_FRAME_BASED) { frame->dwMaxVideoFrameBufferSize = get_unaligned_le32(&buffer[17]); frame->dwDefaultFrameInterval = get_unaligned_le32(&buffer[21]); frame->bFrameIntervalType = buffer[25]; } else { frame->dwMaxVideoFrameBufferSize = 0; frame->dwDefaultFrameInterval = get_unaligned_le32(&buffer[17]); frame->bFrameIntervalType = buffer[21]; } /* * Copy the frame intervals. * * Some bogus devices report dwMinFrameInterval equal to * dwMaxFrameInterval and have dwFrameIntervalStep set to * zero. Setting all null intervals to 1 fixes the problem and * some other divisions by zero that could happen. */ frame->dwFrameInterval = *intervals; for (i = 0; i < n; ++i) { interval = get_unaligned_le32(&buffer[26+4*i]); (*intervals)[i] = interval ? interval : 1; } /* * Apply more fixes, quirks and workarounds to handle incorrect * or broken descriptors. */ /* * Several UVC chipsets screw up dwMaxVideoFrameBufferSize * completely. Observed behaviours range from setting the * value to 1.1x the actual frame size to hardwiring the * 16 low bits to 0. This results in a higher than necessary * memory usage as well as a wrong image size information. For * uncompressed formats this can be fixed by computing the * value from the frame size. */ if (!(format->flags & UVC_FMT_FLAG_COMPRESSED)) frame->dwMaxVideoFrameBufferSize = format->bpp * frame->wWidth * frame->wHeight / 8; /* * Clamp the default frame interval to the boundaries. A zero * bFrameIntervalType value indicates a continuous frame * interval range, with dwFrameInterval[0] storing the minimum * value and dwFrameInterval[1] storing the maximum value. */ maxIntervalIndex = frame->bFrameIntervalType ? n - 1 : 1; frame->dwDefaultFrameInterval = clamp(frame->dwDefaultFrameInterval, frame->dwFrameInterval[0], frame->dwFrameInterval[maxIntervalIndex]); /* * Some devices report frame intervals that are not functional. * If the corresponding quirk is set, restrict operation to the * first interval only. */ if (dev->quirks & UVC_QUIRK_RESTRICT_FRAME_RATE) { frame->bFrameIntervalType = 1; (*intervals)[0] = frame->dwDefaultFrameInterval; } uvc_dbg(dev, DESCR, "- %ux%u (%u.%u fps)\n", frame->wWidth, frame->wHeight, 10000000 / frame->dwDefaultFrameInterval, (100000000 / frame->dwDefaultFrameInterval) % 10); format->nframes++; *intervals += n; buflen -= buffer[0]; buffer += buffer[0]; } if (buflen > 2 && buffer[1] == USB_DT_CS_INTERFACE && buffer[2] == UVC_VS_STILL_IMAGE_FRAME) { buflen -= buffer[0]; buffer += buffer[0]; } if (buflen > 2 && buffer[1] == USB_DT_CS_INTERFACE && buffer[2] == UVC_VS_COLORFORMAT) { if (buflen < 6) { uvc_dbg(dev, DESCR, "device %d videostreaming interface %d COLORFORMAT error\n", dev->udev->devnum, alts->desc.bInterfaceNumber); return -EINVAL; } format->colorspace = uvc_colorspace(buffer[3]); format->xfer_func = uvc_xfer_func(buffer[4]); format->ycbcr_enc = uvc_ycbcr_enc(buffer[5]); buflen -= buffer[0]; buffer += buffer[0]; } else { format->colorspace = V4L2_COLORSPACE_SRGB; } return buffer - start; } static int uvc_parse_streaming(struct uvc_device *dev, struct usb_interface *intf) { struct uvc_streaming *streaming = NULL; struct uvc_format *format; struct uvc_frame *frame; struct usb_host_interface *alts = &intf->altsetting[0]; const unsigned char *_buffer, *buffer = alts->extra; int _buflen, buflen = alts->extralen; unsigned int nformats = 0, nframes = 0, nintervals = 0; unsigned int size, i, n, p; u32 *interval; u16 psize; int ret = -EINVAL; if (intf->cur_altsetting->desc.bInterfaceSubClass != UVC_SC_VIDEOSTREAMING) { uvc_dbg(dev, DESCR, "device %d interface %d isn't a video streaming interface\n", dev->udev->devnum, intf->altsetting[0].desc.bInterfaceNumber); return -EINVAL; } if (usb_driver_claim_interface(&uvc_driver.driver, intf, dev)) { uvc_dbg(dev, DESCR, "device %d interface %d is already claimed\n", dev->udev->devnum, intf->altsetting[0].desc.bInterfaceNumber); return -EINVAL; } streaming = uvc_stream_new(dev, intf); if (streaming == NULL) { usb_driver_release_interface(&uvc_driver.driver, intf); return -ENOMEM; } /* * The Pico iMage webcam has its class-specific interface descriptors * after the endpoint descriptors. */ if (buflen == 0) { for (i = 0; i < alts->desc.bNumEndpoints; ++i) { struct usb_host_endpoint *ep = &alts->endpoint[i]; if (ep->extralen == 0) continue; if (ep->extralen > 2 && ep->extra[1] == USB_DT_CS_INTERFACE) { uvc_dbg(dev, DESCR, "trying extra data from endpoint %u\n", i); buffer = alts->endpoint[i].extra; buflen = alts->endpoint[i].extralen; break; } } } /* Skip the standard interface descriptors. */ while (buflen > 2 && buffer[1] != USB_DT_CS_INTERFACE) { buflen -= buffer[0]; buffer += buffer[0]; } if (buflen <= 2) { uvc_dbg(dev, DESCR, "no class-specific streaming interface descriptors found\n"); goto error; } /* Parse the header descriptor. */ switch (buffer[2]) { case UVC_VS_OUTPUT_HEADER: streaming->type = V4L2_BUF_TYPE_VIDEO_OUTPUT; size = 9; break; case UVC_VS_INPUT_HEADER: streaming->type = V4L2_BUF_TYPE_VIDEO_CAPTURE; size = 13; break; default: uvc_dbg(dev, DESCR, "device %d videostreaming interface %d HEADER descriptor not found\n", dev->udev->devnum, alts->desc.bInterfaceNumber); goto error; } p = buflen >= 4 ? buffer[3] : 0; n = buflen >= size ? buffer[size-1] : 0; if (buflen < size + p*n) { uvc_dbg(dev, DESCR, "device %d videostreaming interface %d HEADER descriptor is invalid\n", dev->udev->devnum, alts->desc.bInterfaceNumber); goto error; } streaming->header.bNumFormats = p; streaming->header.bEndpointAddress = buffer[6]; if (buffer[2] == UVC_VS_INPUT_HEADER) { streaming->header.bmInfo = buffer[7]; streaming->header.bTerminalLink = buffer[8]; streaming->header.bStillCaptureMethod = buffer[9]; streaming->header.bTriggerSupport = buffer[10]; streaming->header.bTriggerUsage = buffer[11]; } else { streaming->header.bTerminalLink = buffer[7]; } streaming->header.bControlSize = n; streaming->header.bmaControls = kmemdup(&buffer[size], p * n, GFP_KERNEL); if (streaming->header.bmaControls == NULL) { ret = -ENOMEM; goto error; } buflen -= buffer[0]; buffer += buffer[0]; _buffer = buffer; _buflen = buflen; /* Count the format and frame descriptors. */ while (_buflen > 2 && _buffer[1] == USB_DT_CS_INTERFACE) { switch (_buffer[2]) { case UVC_VS_FORMAT_UNCOMPRESSED: case UVC_VS_FORMAT_MJPEG: case UVC_VS_FORMAT_FRAME_BASED: nformats++; break; case UVC_VS_FORMAT_DV: /* * DV format has no frame descriptor. We will create a * dummy frame descriptor with a dummy frame interval. */ nformats++; nframes++; nintervals++; break; case UVC_VS_FORMAT_MPEG2TS: case UVC_VS_FORMAT_STREAM_BASED: uvc_dbg(dev, DESCR, "device %d videostreaming interface %d FORMAT %u is not supported\n", dev->udev->devnum, alts->desc.bInterfaceNumber, _buffer[2]); break; case UVC_VS_FRAME_UNCOMPRESSED: case UVC_VS_FRAME_MJPEG: nframes++; if (_buflen > 25) nintervals += _buffer[25] ? _buffer[25] : 3; break; case UVC_VS_FRAME_FRAME_BASED: nframes++; if (_buflen > 21) nintervals += _buffer[21] ? _buffer[21] : 3; break; } _buflen -= _buffer[0]; _buffer += _buffer[0]; } if (nformats == 0) { uvc_dbg(dev, DESCR, "device %d videostreaming interface %d has no supported formats defined\n", dev->udev->devnum, alts->desc.bInterfaceNumber); goto error; } size = nformats * sizeof(*format) + nframes * sizeof(*frame) + nintervals * sizeof(*interval); format = kzalloc(size, GFP_KERNEL); if (format == NULL) { ret = -ENOMEM; goto error; } frame = (struct uvc_frame *)&format[nformats]; interval = (u32 *)&frame[nframes]; streaming->formats = format; streaming->nformats = 0; /* Parse the format descriptors. */ while (buflen > 2 && buffer[1] == USB_DT_CS_INTERFACE) { switch (buffer[2]) { case UVC_VS_FORMAT_UNCOMPRESSED: case UVC_VS_FORMAT_MJPEG: case UVC_VS_FORMAT_DV: case UVC_VS_FORMAT_FRAME_BASED: ret = uvc_parse_format(dev, streaming, format, frame, &interval, buffer, buflen); if (ret < 0) goto error; if (!ret) break; streaming->nformats++; frame += format->nframes; format++; buflen -= ret; buffer += ret; continue; default: break; } buflen -= buffer[0]; buffer += buffer[0]; } if (buflen) uvc_dbg(dev, DESCR, "device %d videostreaming interface %d has %u bytes of trailing descriptor garbage\n", dev->udev->devnum, alts->desc.bInterfaceNumber, buflen); /* Parse the alternate settings to find the maximum bandwidth. */ for (i = 0; i < intf->num_altsetting; ++i) { struct usb_host_endpoint *ep; alts = &intf->altsetting[i]; ep = uvc_find_endpoint(alts, streaming->header.bEndpointAddress); if (ep == NULL) continue; psize = uvc_endpoint_max_bpi(dev->udev, ep); if (psize > streaming->maxpsize) streaming->maxpsize = psize; } list_add_tail(&streaming->list, &dev->streams); return 0; error: usb_driver_release_interface(&uvc_driver.driver, intf); uvc_stream_delete(streaming); return ret; } static const u8 uvc_camera_guid[16] = UVC_GUID_UVC_CAMERA; static const u8 uvc_gpio_guid[16] = UVC_GUID_EXT_GPIO_CONTROLLER; static const u8 uvc_media_transport_input_guid[16] = UVC_GUID_UVC_MEDIA_TRANSPORT_INPUT; static const u8 uvc_processing_guid[16] = UVC_GUID_UVC_PROCESSING; static struct uvc_entity *uvc_alloc_entity(u16 type, u16 id, unsigned int num_pads, unsigned int extra_size) { struct uvc_entity *entity; unsigned int num_inputs; unsigned int size; unsigned int i; extra_size = roundup(extra_size, sizeof(*entity->pads)); if (num_pads) num_inputs = type & UVC_TERM_OUTPUT ? num_pads : num_pads - 1; else num_inputs = 0; size = sizeof(*entity) + extra_size + sizeof(*entity->pads) * num_pads + num_inputs; entity = kzalloc(size, GFP_KERNEL); if (entity == NULL) return NULL; entity->id = id; entity->type = type; /* * Set the GUID for standard entity types. For extension units, the GUID * is initialized by the caller. */ switch (type) { case UVC_EXT_GPIO_UNIT: memcpy(entity->guid, uvc_gpio_guid, 16); break; case UVC_ITT_CAMERA: memcpy(entity->guid, uvc_camera_guid, 16); break; case UVC_ITT_MEDIA_TRANSPORT_INPUT: memcpy(entity->guid, uvc_media_transport_input_guid, 16); break; case UVC_VC_PROCESSING_UNIT: memcpy(entity->guid, uvc_processing_guid, 16); break; } entity->num_links = 0; entity->num_pads = num_pads; entity->pads = ((void *)(entity + 1)) + extra_size; for (i = 0; i < num_inputs; ++i) entity->pads[i].flags = MEDIA_PAD_FL_SINK; if (!UVC_ENTITY_IS_OTERM(entity) && num_pads) entity->pads[num_pads-1].flags = MEDIA_PAD_FL_SOURCE; entity->bNrInPins = num_inputs; entity->baSourceID = (u8 *)(&entity->pads[num_pads]); return entity; } static void uvc_entity_set_name(struct uvc_device *dev, struct uvc_entity *entity, const char *type_name, u8 string_id) { int ret; /* * First attempt to read the entity name from the device. If the entity * has no associated string, or if reading the string fails (most * likely due to a buggy firmware), fall back to default names based on * the entity type. */ if (string_id) { ret = usb_string(dev->udev, string_id, entity->name, sizeof(entity->name)); if (!ret) return; } sprintf(entity->name, "%s %u", type_name, entity->id); } /* Parse vendor-specific extensions. */ static int uvc_parse_vendor_control(struct uvc_device *dev, const unsigned char *buffer, int buflen) { struct usb_device *udev = dev->udev; struct usb_host_interface *alts = dev->intf->cur_altsetting; struct uvc_entity *unit; unsigned int n, p; int handled = 0; switch (le16_to_cpu(dev->udev->descriptor.idVendor)) { case 0x046d: /* Logitech */ if (buffer[1] != 0x41 || buffer[2] != 0x01) break; /* * Logitech implements several vendor specific functions * through vendor specific extension units (LXU). * * The LXU descriptors are similar to XU descriptors * (see "USB Device Video Class for Video Devices", section * 3.7.2.6 "Extension Unit Descriptor") with the following * differences: * * ---------------------------------------------------------- * 0 bLength 1 Number * Size of this descriptor, in bytes: 24+p+n*2 * ---------------------------------------------------------- * 23+p+n bmControlsType N Bitmap * Individual bits in the set are defined: * 0: Absolute * 1: Relative * * This bitset is mapped exactly the same as bmControls. * ---------------------------------------------------------- * 23+p+n*2 bReserved 1 Boolean * ---------------------------------------------------------- * 24+p+n*2 iExtension 1 Index * Index of a string descriptor that describes this * extension unit. * ---------------------------------------------------------- */ p = buflen >= 22 ? buffer[21] : 0; n = buflen >= 25 + p ? buffer[22+p] : 0; if (buflen < 25 + p + 2*n) { uvc_dbg(dev, DESCR, "device %d videocontrol interface %d EXTENSION_UNIT error\n", udev->devnum, alts->desc.bInterfaceNumber); break; } unit = uvc_alloc_entity(UVC_VC_EXTENSION_UNIT, buffer[3], p + 1, 2*n); if (unit == NULL) return -ENOMEM; memcpy(unit->guid, &buffer[4], 16); unit->extension.bNumControls = buffer[20]; memcpy(unit->baSourceID, &buffer[22], p); unit->extension.bControlSize = buffer[22+p]; unit->extension.bmControls = (u8 *)unit + sizeof(*unit); unit->extension.bmControlsType = (u8 *)unit + sizeof(*unit) + n; memcpy(unit->extension.bmControls, &buffer[23+p], 2*n); uvc_entity_set_name(dev, unit, "Extension", buffer[24+p+2*n]); list_add_tail(&unit->list, &dev->entities); handled = 1; break; } return handled; } static int uvc_parse_standard_control(struct uvc_device *dev, const unsigned char *buffer, int buflen) { struct usb_device *udev = dev->udev; struct uvc_entity *unit, *term; struct usb_interface *intf; struct usb_host_interface *alts = dev->intf->cur_altsetting; unsigned int i, n, p, len; const char *type_name; u16 type; switch (buffer[2]) { case UVC_VC_HEADER: n = buflen >= 12 ? buffer[11] : 0; if (buflen < 12 + n) { uvc_dbg(dev, DESCR, "device %d videocontrol interface %d HEADER error\n", udev->devnum, alts->desc.bInterfaceNumber); return -EINVAL; } dev->uvc_version = get_unaligned_le16(&buffer[3]); dev->clock_frequency = get_unaligned_le32(&buffer[7]); /* Parse all USB Video Streaming interfaces. */ for (i = 0; i < n; ++i) { intf = usb_ifnum_to_if(udev, buffer[12+i]); if (intf == NULL) { uvc_dbg(dev, DESCR, "device %d interface %d doesn't exists\n", udev->devnum, i); continue; } uvc_parse_streaming(dev, intf); } break; case UVC_VC_INPUT_TERMINAL: if (buflen < 8) { uvc_dbg(dev, DESCR, "device %d videocontrol interface %d INPUT_TERMINAL error\n", udev->devnum, alts->desc.bInterfaceNumber); return -EINVAL; } /* * Reject invalid terminal types that would cause issues: * * - The high byte must be non-zero, otherwise it would be * confused with a unit. * * - Bit 15 must be 0, as we use it internally as a terminal * direction flag. * * Other unknown types are accepted. */ type = get_unaligned_le16(&buffer[4]); if ((type & 0x7f00) == 0 || (type & 0x8000) != 0) { uvc_dbg(dev, DESCR, "device %d videocontrol interface %d INPUT_TERMINAL %d has invalid type 0x%04x, skipping\n", udev->devnum, alts->desc.bInterfaceNumber, buffer[3], type); return 0; } n = 0; p = 0; len = 8; if (type == UVC_ITT_CAMERA) { n = buflen >= 15 ? buffer[14] : 0; len = 15; } else if (type == UVC_ITT_MEDIA_TRANSPORT_INPUT) { n = buflen >= 9 ? buffer[8] : 0; p = buflen >= 10 + n ? buffer[9+n] : 0; len = 10; } if (buflen < len + n + p) { uvc_dbg(dev, DESCR, "device %d videocontrol interface %d INPUT_TERMINAL error\n", udev->devnum, alts->desc.bInterfaceNumber); return -EINVAL; } term = uvc_alloc_entity(type | UVC_TERM_INPUT, buffer[3], 1, n + p); if (term == NULL) return -ENOMEM; if (UVC_ENTITY_TYPE(term) == UVC_ITT_CAMERA) { term->camera.bControlSize = n; term->camera.bmControls = (u8 *)term + sizeof(*term); term->camera.wObjectiveFocalLengthMin = get_unaligned_le16(&buffer[8]); term->camera.wObjectiveFocalLengthMax = get_unaligned_le16(&buffer[10]); term->camera.wOcularFocalLength = get_unaligned_le16(&buffer[12]); memcpy(term->camera.bmControls, &buffer[15], n); } else if (UVC_ENTITY_TYPE(term) == UVC_ITT_MEDIA_TRANSPORT_INPUT) { term->media.bControlSize = n; term->media.bmControls = (u8 *)term + sizeof(*term); term->media.bTransportModeSize = p; term->media.bmTransportModes = (u8 *)term + sizeof(*term) + n; memcpy(term->media.bmControls, &buffer[9], n); memcpy(term->media.bmTransportModes, &buffer[10+n], p); } if (UVC_ENTITY_TYPE(term) == UVC_ITT_CAMERA) type_name = "Camera"; else if (UVC_ENTITY_TYPE(term) == UVC_ITT_MEDIA_TRANSPORT_INPUT) type_name = "Media"; else type_name = "Input"; uvc_entity_set_name(dev, term, type_name, buffer[7]); list_add_tail(&term->list, &dev->entities); break; case UVC_VC_OUTPUT_TERMINAL: if (buflen < 9) { uvc_dbg(dev, DESCR, "device %d videocontrol interface %d OUTPUT_TERMINAL error\n", udev->devnum, alts->desc.bInterfaceNumber); return -EINVAL; } /* * Make sure the terminal type MSB is not null, otherwise it * could be confused with a unit. */ type = get_unaligned_le16(&buffer[4]); if ((type & 0xff00) == 0) { uvc_dbg(dev, DESCR, "device %d videocontrol interface %d OUTPUT_TERMINAL %d has invalid type 0x%04x, skipping\n", udev->devnum, alts->desc.bInterfaceNumber, buffer[3], type); return 0; } term = uvc_alloc_entity(type | UVC_TERM_OUTPUT, buffer[3], 1, 0); if (term == NULL) return -ENOMEM; memcpy(term->baSourceID, &buffer[7], 1); uvc_entity_set_name(dev, term, "Output", buffer[8]); list_add_tail(&term->list, &dev->entities); break; case UVC_VC_SELECTOR_UNIT: p = buflen >= 5 ? buffer[4] : 0; if (buflen < 5 || buflen < 6 + p) { uvc_dbg(dev, DESCR, "device %d videocontrol interface %d SELECTOR_UNIT error\n", udev->devnum, alts->desc.bInterfaceNumber); return -EINVAL; } unit = uvc_alloc_entity(buffer[2], buffer[3], p + 1, 0); if (unit == NULL) return -ENOMEM; memcpy(unit->baSourceID, &buffer[5], p); uvc_entity_set_name(dev, unit, "Selector", buffer[5+p]); list_add_tail(&unit->list, &dev->entities); break; case UVC_VC_PROCESSING_UNIT: n = buflen >= 8 ? buffer[7] : 0; p = dev->uvc_version >= 0x0110 ? 10 : 9; if (buflen < p + n) { uvc_dbg(dev, DESCR, "device %d videocontrol interface %d PROCESSING_UNIT error\n", udev->devnum, alts->desc.bInterfaceNumber); return -EINVAL; } unit = uvc_alloc_entity(buffer[2], buffer[3], 2, n); if (unit == NULL) return -ENOMEM; memcpy(unit->baSourceID, &buffer[4], 1); unit->processing.wMaxMultiplier = get_unaligned_le16(&buffer[5]); unit->processing.bControlSize = buffer[7]; unit->processing.bmControls = (u8 *)unit + sizeof(*unit); memcpy(unit->processing.bmControls, &buffer[8], n); if (dev->uvc_version >= 0x0110) unit->processing.bmVideoStandards = buffer[9+n]; uvc_entity_set_name(dev, unit, "Processing", buffer[8+n]); list_add_tail(&unit->list, &dev->entities); break; case UVC_VC_EXTENSION_UNIT: p = buflen >= 22 ? buffer[21] : 0; n = buflen >= 24 + p ? buffer[22+p] : 0; if (buflen < 24 + p + n) { uvc_dbg(dev, DESCR, "device %d videocontrol interface %d EXTENSION_UNIT error\n", udev->devnum, alts->desc.bInterfaceNumber); return -EINVAL; } unit = uvc_alloc_entity(buffer[2], buffer[3], p + 1, n); if (unit == NULL) return -ENOMEM; memcpy(unit->guid, &buffer[4], 16); unit->extension.bNumControls = buffer[20]; memcpy(unit->baSourceID, &buffer[22], p); unit->extension.bControlSize = buffer[22+p]; unit->extension.bmControls = (u8 *)unit + sizeof(*unit); memcpy(unit->extension.bmControls, &buffer[23+p], n); uvc_entity_set_name(dev, unit, "Extension", buffer[23+p+n]); list_add_tail(&unit->list, &dev->entities); break; default: uvc_dbg(dev, DESCR, "Found an unknown CS_INTERFACE descriptor (%u)\n", buffer[2]); break; } return 0; } static int uvc_parse_control(struct uvc_device *dev) { struct usb_host_interface *alts = dev->intf->cur_altsetting; const unsigned char *buffer = alts->extra; int buflen = alts->extralen; int ret; /* * Parse the default alternate setting only, as the UVC specification * defines a single alternate setting, the default alternate setting * zero. */ while (buflen > 2) { if (uvc_parse_vendor_control(dev, buffer, buflen) || buffer[1] != USB_DT_CS_INTERFACE) goto next_descriptor; ret = uvc_parse_standard_control(dev, buffer, buflen); if (ret < 0) return ret; next_descriptor: buflen -= buffer[0]; buffer += buffer[0]; } /* * Check if the optional status endpoint is present. Built-in iSight * webcams have an interrupt endpoint but spit proprietary data that * don't conform to the UVC status endpoint messages. Don't try to * handle the interrupt endpoint for those cameras. */ if (alts->desc.bNumEndpoints == 1 && !(dev->quirks & UVC_QUIRK_BUILTIN_ISIGHT)) { struct usb_host_endpoint *ep = &alts->endpoint[0]; struct usb_endpoint_descriptor *desc = &ep->desc; if (usb_endpoint_is_int_in(desc) && le16_to_cpu(desc->wMaxPacketSize) >= 8 && desc->bInterval != 0) { uvc_dbg(dev, DESCR, "Found a Status endpoint (addr %02x)\n", desc->bEndpointAddress); dev->int_ep = ep; } } return 0; } /* ----------------------------------------------------------------------------- * Privacy GPIO */ static void uvc_gpio_event(struct uvc_device *dev) { struct uvc_entity *unit = dev->gpio_unit; struct uvc_video_chain *chain; u8 new_val; if (!unit) return; new_val = gpiod_get_value_cansleep(unit->gpio.gpio_privacy); /* GPIO entities are always on the first chain. */ chain = list_first_entry(&dev->chains, struct uvc_video_chain, list); uvc_ctrl_status_event(chain, unit->controls, &new_val); } static int uvc_gpio_get_cur(struct uvc_device *dev, struct uvc_entity *entity, u8 cs, void *data, u16 size) { if (cs != UVC_CT_PRIVACY_CONTROL || size < 1) return -EINVAL; *(u8 *)data = gpiod_get_value_cansleep(entity->gpio.gpio_privacy); return 0; } static int uvc_gpio_get_info(struct uvc_device *dev, struct uvc_entity *entity, u8 cs, u8 *caps) { if (cs != UVC_CT_PRIVACY_CONTROL) return -EINVAL; *caps = UVC_CONTROL_CAP_GET | UVC_CONTROL_CAP_AUTOUPDATE; return 0; } static irqreturn_t uvc_gpio_irq(int irq, void *data) { struct uvc_device *dev = data; uvc_gpio_event(dev); return IRQ_HANDLED; } static int uvc_gpio_parse(struct uvc_device *dev) { struct uvc_entity *unit; struct gpio_desc *gpio_privacy; int irq; gpio_privacy = devm_gpiod_get_optional(&dev->udev->dev, "privacy", GPIOD_IN); if (IS_ERR_OR_NULL(gpio_privacy)) return PTR_ERR_OR_ZERO(gpio_privacy); irq = gpiod_to_irq(gpio_privacy); if (irq < 0) return dev_err_probe(&dev->udev->dev, irq, "No IRQ for privacy GPIO\n"); unit = uvc_alloc_entity(UVC_EXT_GPIO_UNIT, UVC_EXT_GPIO_UNIT_ID, 0, 1); if (!unit) return -ENOMEM; unit->gpio.gpio_privacy = gpio_privacy; unit->gpio.irq = irq; unit->gpio.bControlSize = 1; unit->gpio.bmControls = (u8 *)unit + sizeof(*unit); unit->gpio.bmControls[0] = 1; unit->get_cur = uvc_gpio_get_cur; unit->get_info = uvc_gpio_get_info; strscpy(unit->name, "GPIO", sizeof(unit->name)); list_add_tail(&unit->list, &dev->entities); dev->gpio_unit = unit; return 0; } static int uvc_gpio_init_irq(struct uvc_device *dev) { struct uvc_entity *unit = dev->gpio_unit; if (!unit || unit->gpio.irq < 0) return 0; return devm_request_threaded_irq(&dev->udev->dev, unit->gpio.irq, NULL, uvc_gpio_irq, IRQF_ONESHOT | IRQF_TRIGGER_FALLING | IRQF_TRIGGER_RISING, "uvc_privacy_gpio", dev); } /* ------------------------------------------------------------------------ * UVC device scan */ /* * Scan the UVC descriptors to locate a chain starting at an Output Terminal * and containing the following units: * * - one or more Output Terminals (USB Streaming or Display) * - zero or one Processing Unit * - zero, one or more single-input Selector Units * - zero or one multiple-input Selector Units, provided all inputs are * connected to input terminals * - zero, one or mode single-input Extension Units * - one or more Input Terminals (Camera, External or USB Streaming) * * The terminal and units must match on of the following structures: * * ITT_*(0) -> +---------+ +---------+ +---------+ -> TT_STREAMING(0) * ... | SU{0,1} | -> | PU{0,1} | -> | XU{0,n} | ... * ITT_*(n) -> +---------+ +---------+ +---------+ -> TT_STREAMING(n) * * +---------+ +---------+ -> OTT_*(0) * TT_STREAMING -> | PU{0,1} | -> | XU{0,n} | ... * +---------+ +---------+ -> OTT_*(n) * * The Processing Unit and Extension Units can be in any order. Additional * Extension Units connected to the main chain as single-unit branches are * also supported. Single-input Selector Units are ignored. */ static int uvc_scan_chain_entity(struct uvc_video_chain *chain, struct uvc_entity *entity) { switch (UVC_ENTITY_TYPE(entity)) { case UVC_VC_EXTENSION_UNIT: uvc_dbg_cont(PROBE, " <- XU %d", entity->id); if (entity->bNrInPins != 1) { uvc_dbg(chain->dev, DESCR, "Extension unit %d has more than 1 input pin\n", entity->id); return -1; } break; case UVC_VC_PROCESSING_UNIT: uvc_dbg_cont(PROBE, " <- PU %d", entity->id); if (chain->processing != NULL) { uvc_dbg(chain->dev, DESCR, "Found multiple Processing Units in chain\n"); return -1; } chain->processing = entity; break; case UVC_VC_SELECTOR_UNIT: uvc_dbg_cont(PROBE, " <- SU %d", entity->id); /* Single-input selector units are ignored. */ if (entity->bNrInPins == 1) break; if (chain->selector != NULL) { uvc_dbg(chain->dev, DESCR, "Found multiple Selector Units in chain\n"); return -1; } chain->selector = entity; break; case UVC_ITT_VENDOR_SPECIFIC: case UVC_ITT_CAMERA: case UVC_ITT_MEDIA_TRANSPORT_INPUT: uvc_dbg_cont(PROBE, " <- IT %d\n", entity->id); break; case UVC_OTT_VENDOR_SPECIFIC: case UVC_OTT_DISPLAY: case UVC_OTT_MEDIA_TRANSPORT_OUTPUT: uvc_dbg_cont(PROBE, " OT %d", entity->id); break; case UVC_TT_STREAMING: if (UVC_ENTITY_IS_ITERM(entity)) uvc_dbg_cont(PROBE, " <- IT %d\n", entity->id); else uvc_dbg_cont(PROBE, " OT %d", entity->id); break; default: uvc_dbg(chain->dev, DESCR, "Unsupported entity type 0x%04x found in chain\n", UVC_ENTITY_TYPE(entity)); return -1; } list_add_tail(&entity->chain, &chain->entities); return 0; } static int uvc_scan_chain_forward(struct uvc_video_chain *chain, struct uvc_entity *entity, struct uvc_entity *prev) { struct uvc_entity *forward; int found; /* Forward scan */ forward = NULL; found = 0; while (1) { forward = uvc_entity_by_reference(chain->dev, entity->id, forward); if (forward == NULL) break; if (forward == prev) continue; if (forward->chain.next || forward->chain.prev) { uvc_dbg(chain->dev, DESCR, "Found reference to entity %d already in chain\n", forward->id); return -EINVAL; } switch (UVC_ENTITY_TYPE(forward)) { case UVC_VC_EXTENSION_UNIT: if (forward->bNrInPins != 1) { uvc_dbg(chain->dev, DESCR, "Extension unit %d has more than 1 input pin\n", forward->id); return -EINVAL; } /* * Some devices reference an output terminal as the * source of extension units. This is incorrect, as * output terminals only have an input pin, and thus * can't be connected to any entity in the forward * direction. The resulting topology would cause issues * when registering the media controller graph. To * avoid this problem, connect the extension unit to * the source of the output terminal instead. */ if (UVC_ENTITY_IS_OTERM(entity)) { struct uvc_entity *source; source = uvc_entity_by_id(chain->dev, entity->baSourceID[0]); if (!source) { uvc_dbg(chain->dev, DESCR, "Can't connect extension unit %u in chain\n", forward->id); break; } forward->baSourceID[0] = source->id; } list_add_tail(&forward->chain, &chain->entities); if (!found) uvc_dbg_cont(PROBE, " (->"); uvc_dbg_cont(PROBE, " XU %d", forward->id); found = 1; break; case UVC_OTT_VENDOR_SPECIFIC: case UVC_OTT_DISPLAY: case UVC_OTT_MEDIA_TRANSPORT_OUTPUT: case UVC_TT_STREAMING: if (UVC_ENTITY_IS_ITERM(forward)) { uvc_dbg(chain->dev, DESCR, "Unsupported input terminal %u\n", forward->id); return -EINVAL; } if (UVC_ENTITY_IS_OTERM(entity)) { uvc_dbg(chain->dev, DESCR, "Unsupported connection between output terminals %u and %u\n", entity->id, forward->id); break; } list_add_tail(&forward->chain, &chain->entities); if (!found) uvc_dbg_cont(PROBE, " (->"); uvc_dbg_cont(PROBE, " OT %d", forward->id); found = 1; break; } } if (found) uvc_dbg_cont(PROBE, ")"); return 0; } static int uvc_scan_chain_backward(struct uvc_video_chain *chain, struct uvc_entity **_entity) { struct uvc_entity *entity = *_entity; struct uvc_entity *term; int id = -EINVAL, i; switch (UVC_ENTITY_TYPE(entity)) { case UVC_VC_EXTENSION_UNIT: case UVC_VC_PROCESSING_UNIT: id = entity->baSourceID[0]; break; case UVC_VC_SELECTOR_UNIT: /* Single-input selector units are ignored. */ if (entity->bNrInPins == 1) { id = entity->baSourceID[0]; break; } uvc_dbg_cont(PROBE, " <- IT"); chain->selector = entity; for (i = 0; i < entity->bNrInPins; ++i) { id = entity->baSourceID[i]; term = uvc_entity_by_id(chain->dev, id); if (term == NULL || !UVC_ENTITY_IS_ITERM(term)) { uvc_dbg(chain->dev, DESCR, "Selector unit %d input %d isn't connected to an input terminal\n", entity->id, i); return -1; } if (term->chain.next || term->chain.prev) { uvc_dbg(chain->dev, DESCR, "Found reference to entity %d already in chain\n", term->id); return -EINVAL; } uvc_dbg_cont(PROBE, " %d", term->id); list_add_tail(&term->chain, &chain->entities); uvc_scan_chain_forward(chain, term, entity); } uvc_dbg_cont(PROBE, "\n"); id = 0; break; case UVC_ITT_VENDOR_SPECIFIC: case UVC_ITT_CAMERA: case UVC_ITT_MEDIA_TRANSPORT_INPUT: case UVC_OTT_VENDOR_SPECIFIC: case UVC_OTT_DISPLAY: case UVC_OTT_MEDIA_TRANSPORT_OUTPUT: case UVC_TT_STREAMING: id = UVC_ENTITY_IS_OTERM(entity) ? entity->baSourceID[0] : 0; break; } if (id <= 0) { *_entity = NULL; return id; } entity = uvc_entity_by_id(chain->dev, id); if (entity == NULL) { uvc_dbg(chain->dev, DESCR, "Found reference to unknown entity %d\n", id); return -EINVAL; } *_entity = entity; return 0; } static int uvc_scan_chain(struct uvc_video_chain *chain, struct uvc_entity *term) { struct uvc_entity *entity, *prev; uvc_dbg(chain->dev, PROBE, "Scanning UVC chain:"); entity = term; prev = NULL; while (entity != NULL) { /* Entity must not be part of an existing chain */ if (entity->chain.next || entity->chain.prev) { uvc_dbg(chain->dev, DESCR, "Found reference to entity %d already in chain\n", entity->id); return -EINVAL; } /* Process entity */ if (uvc_scan_chain_entity(chain, entity) < 0) return -EINVAL; /* Forward scan */ if (uvc_scan_chain_forward(chain, entity, prev) < 0) return -EINVAL; /* Backward scan */ prev = entity; if (uvc_scan_chain_backward(chain, &entity) < 0) return -EINVAL; } return 0; } static unsigned int uvc_print_terms(struct list_head *terms, u16 dir, char *buffer) { struct uvc_entity *term; unsigned int nterms = 0; char *p = buffer; list_for_each_entry(term, terms, chain) { if (!UVC_ENTITY_IS_TERM(term) || UVC_TERM_DIRECTION(term) != dir) continue; if (nterms) p += sprintf(p, ","); if (++nterms >= 4) { p += sprintf(p, "..."); break; } p += sprintf(p, "%u", term->id); } return p - buffer; } static const char *uvc_print_chain(struct uvc_video_chain *chain) { static char buffer[43]; char *p = buffer; p += uvc_print_terms(&chain->entities, UVC_TERM_INPUT, p); p += sprintf(p, " -> "); uvc_print_terms(&chain->entities, UVC_TERM_OUTPUT, p); return buffer; } static struct uvc_video_chain *uvc_alloc_chain(struct uvc_device *dev) { struct uvc_video_chain *chain; chain = kzalloc(sizeof(*chain), GFP_KERNEL); if (chain == NULL) return NULL; INIT_LIST_HEAD(&chain->entities); mutex_init(&chain->ctrl_mutex); chain->dev = dev; v4l2_prio_init(&chain->prio); return chain; } /* * Fallback heuristic for devices that don't connect units and terminals in a * valid chain. * * Some devices have invalid baSourceID references, causing uvc_scan_chain() * to fail, but if we just take the entities we can find and put them together * in the most sensible chain we can think of, turns out they do work anyway. * Note: This heuristic assumes there is a single chain. * * At the time of writing, devices known to have such a broken chain are * - Acer Integrated Camera (5986:055a) * - Realtek rtl157a7 (0bda:57a7) */ static int uvc_scan_fallback(struct uvc_device *dev) { struct uvc_video_chain *chain; struct uvc_entity *iterm = NULL; struct uvc_entity *oterm = NULL; struct uvc_entity *entity; struct uvc_entity *prev; /* * Start by locating the input and output terminals. We only support * devices with exactly one of each for now. */ list_for_each_entry(entity, &dev->entities, list) { if (UVC_ENTITY_IS_ITERM(entity)) { if (iterm) return -EINVAL; iterm = entity; } if (UVC_ENTITY_IS_OTERM(entity)) { if (oterm) return -EINVAL; oterm = entity; } } if (iterm == NULL || oterm == NULL) return -EINVAL; /* Allocate the chain and fill it. */ chain = uvc_alloc_chain(dev); if (chain == NULL) return -ENOMEM; if (uvc_scan_chain_entity(chain, oterm) < 0) goto error; prev = oterm; /* * Add all Processing and Extension Units with two pads. The order * doesn't matter much, use reverse list traversal to connect units in * UVC descriptor order as we build the chain from output to input. This * leads to units appearing in the order meant by the manufacturer for * the cameras known to require this heuristic. */ list_for_each_entry_reverse(entity, &dev->entities, list) { if (entity->type != UVC_VC_PROCESSING_UNIT && entity->type != UVC_VC_EXTENSION_UNIT) continue; if (entity->num_pads != 2) continue; if (uvc_scan_chain_entity(chain, entity) < 0) goto error; prev->baSourceID[0] = entity->id; prev = entity; } if (uvc_scan_chain_entity(chain, iterm) < 0) goto error; prev->baSourceID[0] = iterm->id; list_add_tail(&chain->list, &dev->chains); uvc_dbg(dev, PROBE, "Found a video chain by fallback heuristic (%s)\n", uvc_print_chain(chain)); return 0; error: kfree(chain); return -EINVAL; } /* * Scan the device for video chains and register video devices. * * Chains are scanned starting at their output terminals and walked backwards. */ static int uvc_scan_device(struct uvc_device *dev) { struct uvc_video_chain *chain; struct uvc_entity *term; list_for_each_entry(term, &dev->entities, list) { if (!UVC_ENTITY_IS_OTERM(term)) continue; /* * If the terminal is already included in a chain, skip it. * This can happen for chains that have multiple output * terminals, where all output terminals beside the first one * will be inserted in the chain in forward scans. */ if (term->chain.next || term->chain.prev) continue; chain = uvc_alloc_chain(dev); if (chain == NULL) return -ENOMEM; term->flags |= UVC_ENTITY_FLAG_DEFAULT; if (uvc_scan_chain(chain, term) < 0) { kfree(chain); continue; } uvc_dbg(dev, PROBE, "Found a valid video chain (%s)\n", uvc_print_chain(chain)); list_add_tail(&chain->list, &dev->chains); } if (list_empty(&dev->chains)) uvc_scan_fallback(dev); if (list_empty(&dev->chains)) { dev_info(&dev->udev->dev, "No valid video chain found.\n"); return -1; } /* Add GPIO entity to the first chain. */ if (dev->gpio_unit) { chain = list_first_entry(&dev->chains, struct uvc_video_chain, list); list_add_tail(&dev->gpio_unit->chain, &chain->entities); } return 0; } /* ------------------------------------------------------------------------ * Video device registration and unregistration */ /* * Delete the UVC device. * * Called by the kernel when the last reference to the uvc_device structure * is released. * * As this function is called after or during disconnect(), all URBs have * already been cancelled by the USB core. There is no need to kill the * interrupt URB manually. */ static void uvc_delete(struct kref *kref) { struct uvc_device *dev = container_of(kref, struct uvc_device, ref); struct list_head *p, *n; uvc_status_cleanup(dev); uvc_ctrl_cleanup_device(dev); usb_put_intf(dev->intf); usb_put_dev(dev->udev); #ifdef CONFIG_MEDIA_CONTROLLER media_device_cleanup(&dev->mdev); #endif list_for_each_safe(p, n, &dev->chains) { struct uvc_video_chain *chain; chain = list_entry(p, struct uvc_video_chain, list); kfree(chain); } list_for_each_safe(p, n, &dev->entities) { struct uvc_entity *entity; entity = list_entry(p, struct uvc_entity, list); #ifdef CONFIG_MEDIA_CONTROLLER uvc_mc_cleanup_entity(entity); #endif kfree(entity); } list_for_each_safe(p, n, &dev->streams) { struct uvc_streaming *streaming; streaming = list_entry(p, struct uvc_streaming, list); usb_driver_release_interface(&uvc_driver.driver, streaming->intf); uvc_stream_delete(streaming); } kfree(dev); } static void uvc_release(struct video_device *vdev) { struct uvc_streaming *stream = video_get_drvdata(vdev); struct uvc_device *dev = stream->dev; kref_put(&dev->ref, uvc_delete); } /* * Unregister the video devices. */ static void uvc_unregister_video(struct uvc_device *dev) { struct uvc_streaming *stream; list_for_each_entry(stream, &dev->streams, list) { if (!video_is_registered(&stream->vdev)) continue; video_unregister_device(&stream->vdev); video_unregister_device(&stream->meta.vdev); uvc_debugfs_cleanup_stream(stream); } uvc_status_unregister(dev); if (dev->vdev.dev) v4l2_device_unregister(&dev->vdev); #ifdef CONFIG_MEDIA_CONTROLLER if (media_devnode_is_registered(dev->mdev.devnode)) media_device_unregister(&dev->mdev); #endif } int uvc_register_video_device(struct uvc_device *dev, struct uvc_streaming *stream, struct video_device *vdev, struct uvc_video_queue *queue, enum v4l2_buf_type type, const struct v4l2_file_operations *fops, const struct v4l2_ioctl_ops *ioctl_ops) { int ret; /* Initialize the video buffers queue. */ ret = uvc_queue_init(queue, type, !uvc_no_drop_param); if (ret) return ret; /* Register the device with V4L. */ /* * We already hold a reference to dev->udev. The video device will be * unregistered before the reference is released, so we don't need to * get another one. */ vdev->v4l2_dev = &dev->vdev; vdev->fops = fops; vdev->ioctl_ops = ioctl_ops; vdev->release = uvc_release; vdev->prio = &stream->chain->prio; if (type == V4L2_BUF_TYPE_VIDEO_OUTPUT) vdev->vfl_dir = VFL_DIR_TX; else vdev->vfl_dir = VFL_DIR_RX; switch (type) { case V4L2_BUF_TYPE_VIDEO_CAPTURE: default: vdev->device_caps = V4L2_CAP_VIDEO_CAPTURE | V4L2_CAP_STREAMING; break; case V4L2_BUF_TYPE_VIDEO_OUTPUT: vdev->device_caps = V4L2_CAP_VIDEO_OUTPUT | V4L2_CAP_STREAMING; break; case V4L2_BUF_TYPE_META_CAPTURE: vdev->device_caps = V4L2_CAP_META_CAPTURE | V4L2_CAP_STREAMING; break; } strscpy(vdev->name, dev->name, sizeof(vdev->name)); /* * Set the driver data before calling video_register_device, otherwise * the file open() handler might race us. */ video_set_drvdata(vdev, stream); ret = video_register_device(vdev, VFL_TYPE_VIDEO, -1); if (ret < 0) { dev_err(&stream->intf->dev, "Failed to register %s device (%d).\n", v4l2_type_names[type], ret); return ret; } kref_get(&dev->ref); return 0; } static int uvc_register_video(struct uvc_device *dev, struct uvc_streaming *stream) { int ret; /* Initialize the streaming interface with default parameters. */ ret = uvc_video_init(stream); if (ret < 0) { dev_err(&stream->intf->dev, "Failed to initialize the device (%d).\n", ret); return ret; } if (stream->type == V4L2_BUF_TYPE_VIDEO_CAPTURE) stream->chain->caps |= V4L2_CAP_VIDEO_CAPTURE | V4L2_CAP_META_CAPTURE; else stream->chain->caps |= V4L2_CAP_VIDEO_OUTPUT; uvc_debugfs_init_stream(stream); /* Register the device with V4L. */ return uvc_register_video_device(dev, stream, &stream->vdev, &stream->queue, stream->type, &uvc_fops, &uvc_ioctl_ops); } /* * Register all video devices in all chains. */ static int uvc_register_terms(struct uvc_device *dev, struct uvc_video_chain *chain) { struct uvc_streaming *stream; struct uvc_entity *term; int ret; list_for_each_entry(term, &chain->entities, chain) { if (UVC_ENTITY_TYPE(term) != UVC_TT_STREAMING) continue; stream = uvc_stream_by_id(dev, term->id); if (stream == NULL) { dev_info(&dev->udev->dev, "No streaming interface found for terminal %u.", term->id); continue; } stream->chain = chain; ret = uvc_register_video(dev, stream); if (ret < 0) return ret; /* * Register a metadata node, but ignore a possible failure, * complete registration of video nodes anyway. */ uvc_meta_register(stream); term->vdev = &stream->vdev; } return 0; } static int uvc_register_chains(struct uvc_device *dev) { struct uvc_video_chain *chain; int ret; list_for_each_entry(chain, &dev->chains, list) { ret = uvc_register_terms(dev, chain); if (ret < 0) return ret; #ifdef CONFIG_MEDIA_CONTROLLER ret = uvc_mc_register_entities(chain); if (ret < 0) dev_info(&dev->udev->dev, "Failed to register entities (%d).\n", ret); #endif } return 0; } /* ------------------------------------------------------------------------ * USB probe, disconnect, suspend and resume */ static const struct uvc_device_info uvc_quirk_none = { 0 }; static int uvc_probe(struct usb_interface *intf, const struct usb_device_id *id) { struct usb_device *udev = interface_to_usbdev(intf); struct uvc_device *dev; const struct uvc_device_info *info = (const struct uvc_device_info *)id->driver_info; int function; int ret; /* Allocate memory for the device and initialize it. */ dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (dev == NULL) return -ENOMEM; INIT_LIST_HEAD(&dev->entities); INIT_LIST_HEAD(&dev->chains); INIT_LIST_HEAD(&dev->streams); kref_init(&dev->ref); atomic_set(&dev->nmappings, 0); mutex_init(&dev->lock); dev->udev = usb_get_dev(udev); dev->intf = usb_get_intf(intf); dev->intfnum = intf->cur_altsetting->desc.bInterfaceNumber; dev->info = info ? info : &uvc_quirk_none; dev->quirks = uvc_quirks_param == -1 ? dev->info->quirks : uvc_quirks_param; if (id->idVendor && id->idProduct) uvc_dbg(dev, PROBE, "Probing known UVC device %s (%04x:%04x)\n", udev->devpath, id->idVendor, id->idProduct); else uvc_dbg(dev, PROBE, "Probing generic UVC device %s\n", udev->devpath); if (udev->product != NULL) strscpy(dev->name, udev->product, sizeof(dev->name)); else snprintf(dev->name, sizeof(dev->name), "UVC Camera (%04x:%04x)", le16_to_cpu(udev->descriptor.idVendor), le16_to_cpu(udev->descriptor.idProduct)); /* * Add iFunction or iInterface to names when available as additional * distinguishers between interfaces. iFunction is prioritized over * iInterface which matches Windows behavior at the point of writing. */ if (intf->intf_assoc && intf->intf_assoc->iFunction != 0) function = intf->intf_assoc->iFunction; else function = intf->cur_altsetting->desc.iInterface; if (function != 0) { size_t len; strlcat(dev->name, ": ", sizeof(dev->name)); len = strlen(dev->name); usb_string(udev, function, dev->name + len, sizeof(dev->name) - len); } /* Initialize the media device. */ #ifdef CONFIG_MEDIA_CONTROLLER dev->mdev.dev = &intf->dev; strscpy(dev->mdev.model, dev->name, sizeof(dev->mdev.model)); if (udev->serial) strscpy(dev->mdev.serial, udev->serial, sizeof(dev->mdev.serial)); usb_make_path(udev, dev->mdev.bus_info, sizeof(dev->mdev.bus_info)); dev->mdev.hw_revision = le16_to_cpu(udev->descriptor.bcdDevice); media_device_init(&dev->mdev); dev->vdev.mdev = &dev->mdev; #endif /* Parse the Video Class control descriptor. */ if (uvc_parse_control(dev) < 0) { uvc_dbg(dev, PROBE, "Unable to parse UVC descriptors\n"); goto error; } /* Parse the associated GPIOs. */ if (uvc_gpio_parse(dev) < 0) { uvc_dbg(dev, PROBE, "Unable to parse UVC GPIOs\n"); goto error; } dev_info(&dev->udev->dev, "Found UVC %u.%02x device %s (%04x:%04x)\n", dev->uvc_version >> 8, dev->uvc_version & 0xff, udev->product ? udev->product : "<unnamed>", le16_to_cpu(udev->descriptor.idVendor), le16_to_cpu(udev->descriptor.idProduct)); if (dev->quirks != dev->info->quirks) { dev_info(&dev->udev->dev, "Forcing device quirks to 0x%x by module parameter for testing purpose.\n", dev->quirks); dev_info(&dev->udev->dev, "Please report required quirks to the linux-media mailing list.\n"); } if (dev->info->uvc_version) { dev->uvc_version = dev->info->uvc_version; dev_info(&dev->udev->dev, "Forcing UVC version to %u.%02x\n", dev->uvc_version >> 8, dev->uvc_version & 0xff); } /* Register the V4L2 device. */ if (v4l2_device_register(&intf->dev, &dev->vdev) < 0) goto error; /* Scan the device for video chains. */ if (uvc_scan_device(dev) < 0) goto error; /* Initialize controls. */ if (uvc_ctrl_init_device(dev) < 0) goto error; /* Register video device nodes. */ if (uvc_register_chains(dev) < 0) goto error; #ifdef CONFIG_MEDIA_CONTROLLER /* Register the media device node */ if (media_device_register(&dev->mdev) < 0) goto error; #endif /* Save our data pointer in the interface data. */ usb_set_intfdata(intf, dev); /* Initialize the interrupt URB. */ ret = uvc_status_init(dev); if (ret < 0) { dev_info(&dev->udev->dev, "Unable to initialize the status endpoint (%d), status interrupt will not be supported.\n", ret); } ret = uvc_gpio_init_irq(dev); if (ret < 0) { dev_err(&dev->udev->dev, "Unable to request privacy GPIO IRQ (%d)\n", ret); goto error; } uvc_dbg(dev, PROBE, "UVC device initialized\n"); usb_enable_autosuspend(udev); return 0; error: uvc_unregister_video(dev); kref_put(&dev->ref, uvc_delete); return -ENODEV; } static void uvc_disconnect(struct usb_interface *intf) { struct uvc_device *dev = usb_get_intfdata(intf); /* * Set the USB interface data to NULL. This can be done outside the * lock, as there's no other reader. */ usb_set_intfdata(intf, NULL); if (intf->cur_altsetting->desc.bInterfaceSubClass == UVC_SC_VIDEOSTREAMING) return; uvc_unregister_video(dev); kref_put(&dev->ref, uvc_delete); } static int uvc_suspend(struct usb_interface *intf, pm_message_t message) { struct uvc_device *dev = usb_get_intfdata(intf); struct uvc_streaming *stream; uvc_dbg(dev, SUSPEND, "Suspending interface %u\n", intf->cur_altsetting->desc.bInterfaceNumber); /* Controls are cached on the fly so they don't need to be saved. */ if (intf->cur_altsetting->desc.bInterfaceSubClass == UVC_SC_VIDEOCONTROL) { mutex_lock(&dev->lock); if (dev->users) uvc_status_stop(dev); mutex_unlock(&dev->lock); return 0; } list_for_each_entry(stream, &dev->streams, list) { if (stream->intf == intf) return uvc_video_suspend(stream); } uvc_dbg(dev, SUSPEND, "Suspend: video streaming USB interface mismatch\n"); return -EINVAL; } static int __uvc_resume(struct usb_interface *intf, int reset) { struct uvc_device *dev = usb_get_intfdata(intf); struct uvc_streaming *stream; int ret = 0; uvc_dbg(dev, SUSPEND, "Resuming interface %u\n", intf->cur_altsetting->desc.bInterfaceNumber); if (intf->cur_altsetting->desc.bInterfaceSubClass == UVC_SC_VIDEOCONTROL) { if (reset) { ret = uvc_ctrl_restore_values(dev); if (ret < 0) return ret; } mutex_lock(&dev->lock); if (dev->users) ret = uvc_status_start(dev, GFP_NOIO); mutex_unlock(&dev->lock); return ret; } list_for_each_entry(stream, &dev->streams, list) { if (stream->intf == intf) { ret = uvc_video_resume(stream, reset); if (ret < 0) uvc_queue_streamoff(&stream->queue, stream->queue.queue.type); return ret; } } uvc_dbg(dev, SUSPEND, "Resume: video streaming USB interface mismatch\n"); return -EINVAL; } static int uvc_resume(struct usb_interface *intf) { return __uvc_resume(intf, 0); } static int uvc_reset_resume(struct usb_interface *intf) { return __uvc_resume(intf, 1); } /* ------------------------------------------------------------------------ * Module parameters */ static int uvc_clock_param_get(char *buffer, const struct kernel_param *kp) { if (uvc_clock_param == CLOCK_MONOTONIC) return sprintf(buffer, "CLOCK_MONOTONIC"); else return sprintf(buffer, "CLOCK_REALTIME"); } static int uvc_clock_param_set(const char *val, const struct kernel_param *kp) { if (strncasecmp(val, "clock_", strlen("clock_")) == 0) val += strlen("clock_"); if (strcasecmp(val, "monotonic") == 0) uvc_clock_param = CLOCK_MONOTONIC; else if (strcasecmp(val, "realtime") == 0) uvc_clock_param = CLOCK_REALTIME; else return -EINVAL; return 0; } module_param_call(clock, uvc_clock_param_set, uvc_clock_param_get, &uvc_clock_param, 0644); MODULE_PARM_DESC(clock, "Video buffers timestamp clock"); module_param_named(hwtimestamps, uvc_hw_timestamps_param, uint, 0644); MODULE_PARM_DESC(hwtimestamps, "Use hardware timestamps"); module_param_named(nodrop, uvc_no_drop_param, uint, 0644); MODULE_PARM_DESC(nodrop, "Don't drop incomplete frames"); module_param_named(quirks, uvc_quirks_param, uint, 0644); MODULE_PARM_DESC(quirks, "Forced device quirks"); module_param_named(trace, uvc_dbg_param, uint, 0644); MODULE_PARM_DESC(trace, "Trace level bitmask"); module_param_named(timeout, uvc_timeout_param, uint, 0644); MODULE_PARM_DESC(timeout, "Streaming control requests timeout"); /* ------------------------------------------------------------------------ * Driver initialization and cleanup */ static const struct uvc_device_info uvc_ctrl_power_line_limited = { .mappings = (const struct uvc_control_mapping *[]) { &uvc_ctrl_power_line_mapping_limited, NULL, /* Sentinel */ }, }; static const struct uvc_device_info uvc_ctrl_power_line_uvc11 = { .mappings = (const struct uvc_control_mapping *[]) { &uvc_ctrl_power_line_mapping_uvc11, NULL, /* Sentinel */ }, }; static const struct uvc_device_info uvc_quirk_probe_minmax = { .quirks = UVC_QUIRK_PROBE_MINMAX, }; static const struct uvc_device_info uvc_quirk_fix_bandwidth = { .quirks = UVC_QUIRK_FIX_BANDWIDTH, }; static const struct uvc_device_info uvc_quirk_probe_def = { .quirks = UVC_QUIRK_PROBE_DEF, }; static const struct uvc_device_info uvc_quirk_stream_no_fid = { .quirks = UVC_QUIRK_STREAM_NO_FID, }; static const struct uvc_device_info uvc_quirk_force_y8 = { .quirks = UVC_QUIRK_FORCE_Y8, }; #define UVC_INFO_QUIRK(q) (kernel_ulong_t)&(struct uvc_device_info){.quirks = q} #define UVC_INFO_META(m) (kernel_ulong_t)&(struct uvc_device_info) \ {.meta_format = m} /* * The Logitech cameras listed below have their interface class set to * VENDOR_SPEC because they don't announce themselves as UVC devices, even * though they are compliant. */ static const struct usb_device_id uvc_ids[] = { /* Quanta USB2.0 HD UVC Webcam */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x0408, .idProduct = 0x3090, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_ctrl_power_line_limited }, /* Quanta USB2.0 HD UVC Webcam */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x0408, .idProduct = 0x4030, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_ctrl_power_line_limited }, /* Quanta USB2.0 HD UVC Webcam */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x0408, .idProduct = 0x4034, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = UVC_PC_PROTOCOL_15, .driver_info = (kernel_ulong_t)&uvc_ctrl_power_line_limited }, /* LogiLink Wireless Webcam */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x0416, .idProduct = 0xa91a, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_probe_minmax }, /* Genius eFace 2025 */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x0458, .idProduct = 0x706e, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_probe_minmax }, /* Microsoft Lifecam NX-6000 */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x045e, .idProduct = 0x00f8, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_probe_minmax }, /* Microsoft Lifecam NX-3000 */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x045e, .idProduct = 0x0721, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_probe_def }, /* Microsoft Lifecam VX-7000 */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x045e, .idProduct = 0x0723, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_probe_minmax }, /* Logitech, Webcam C910 */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x046d, .idProduct = 0x0821, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = UVC_INFO_QUIRK(UVC_QUIRK_WAKE_AUTOSUSPEND)}, /* Logitech, Webcam B910 */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x046d, .idProduct = 0x0823, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = UVC_INFO_QUIRK(UVC_QUIRK_WAKE_AUTOSUSPEND)}, /* Logitech Quickcam Fusion */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x046d, .idProduct = 0x08c1, .bInterfaceClass = USB_CLASS_VENDOR_SPEC, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0 }, /* Logitech Quickcam Orbit MP */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x046d, .idProduct = 0x08c2, .bInterfaceClass = USB_CLASS_VENDOR_SPEC, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0 }, /* Logitech Quickcam Pro for Notebook */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x046d, .idProduct = 0x08c3, .bInterfaceClass = USB_CLASS_VENDOR_SPEC, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0 }, /* Logitech Quickcam Pro 5000 */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x046d, .idProduct = 0x08c5, .bInterfaceClass = USB_CLASS_VENDOR_SPEC, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0 }, /* Logitech Quickcam OEM Dell Notebook */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x046d, .idProduct = 0x08c6, .bInterfaceClass = USB_CLASS_VENDOR_SPEC, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0 }, /* Logitech Quickcam OEM Cisco VT Camera II */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x046d, .idProduct = 0x08c7, .bInterfaceClass = USB_CLASS_VENDOR_SPEC, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0 }, /* Logitech HD Pro Webcam C920 */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x046d, .idProduct = 0x082d, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = UVC_INFO_QUIRK(UVC_QUIRK_RESTORE_CTRLS_ON_INIT) }, /* Chicony CNF7129 (Asus EEE 100HE) */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x04f2, .idProduct = 0xb071, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = UVC_INFO_QUIRK(UVC_QUIRK_RESTRICT_FRAME_RATE) }, /* Chicony EasyCamera */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x04f2, .idProduct = 0xb5eb, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_ctrl_power_line_limited }, /* Chicony Electronics Co., Ltd Integrated Camera */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x04f2, .idProduct = 0xb67c, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = UVC_PC_PROTOCOL_15, .driver_info = (kernel_ulong_t)&uvc_ctrl_power_line_uvc11 }, /* Chicony EasyCamera */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x04f2, .idProduct = 0xb6ba, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_ctrl_power_line_limited }, /* Chicony EasyCamera */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x04f2, .idProduct = 0xb746, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_ctrl_power_line_limited }, /* Alcor Micro AU3820 (Future Boy PC USB Webcam) */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x058f, .idProduct = 0x3820, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_probe_minmax }, /* Dell XPS m1530 */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x05a9, .idProduct = 0x2640, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_probe_def }, /* Dell SP2008WFP Monitor */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x05a9, .idProduct = 0x2641, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_probe_def }, /* Dell Alienware X51 */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x05a9, .idProduct = 0x2643, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_probe_def }, /* Dell Studio Hybrid 140g (OmniVision webcam) */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x05a9, .idProduct = 0x264a, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_probe_def }, /* Dell XPS M1330 (OmniVision OV7670 webcam) */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x05a9, .idProduct = 0x7670, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_probe_def }, /* Apple Built-In iSight */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x05ac, .idProduct = 0x8501, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = UVC_INFO_QUIRK(UVC_QUIRK_PROBE_MINMAX | UVC_QUIRK_BUILTIN_ISIGHT) }, /* Apple FaceTime HD Camera (Built-In) */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x05ac, .idProduct = 0x8514, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_probe_def }, /* Apple Built-In iSight via iBridge */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x05ac, .idProduct = 0x8600, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_probe_def }, /* Foxlink ("HP Webcam" on HP Mini 5103) */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x05c8, .idProduct = 0x0403, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_fix_bandwidth }, /* Genesys Logic USB 2.0 PC Camera */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x05e3, .idProduct = 0x0505, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_stream_no_fid }, /* Hercules Classic Silver */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x06f8, .idProduct = 0x300c, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_fix_bandwidth }, /* ViMicro Vega */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x0ac8, .idProduct = 0x332d, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_fix_bandwidth }, /* ViMicro - Minoru3D */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x0ac8, .idProduct = 0x3410, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_fix_bandwidth }, /* ViMicro Venus - Minoru3D */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x0ac8, .idProduct = 0x3420, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_fix_bandwidth }, /* Ophir Optronics - SPCAM 620U */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x0bd3, .idProduct = 0x0555, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_probe_minmax }, /* MT6227 */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x0e8d, .idProduct = 0x0004, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = UVC_INFO_QUIRK(UVC_QUIRK_PROBE_MINMAX | UVC_QUIRK_PROBE_DEF) }, /* IMC Networks (Medion Akoya) */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x13d3, .idProduct = 0x5103, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_stream_no_fid }, /* JMicron USB2.0 XGA WebCam */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x152d, .idProduct = 0x0310, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_probe_minmax }, /* Syntek (HP Spartan) */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x174f, .idProduct = 0x5212, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_stream_no_fid }, /* Syntek (Samsung Q310) */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x174f, .idProduct = 0x5931, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_stream_no_fid }, /* Syntek (Packard Bell EasyNote MX52 */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x174f, .idProduct = 0x8a12, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_stream_no_fid }, /* Syntek (Asus F9SG) */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x174f, .idProduct = 0x8a31, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_stream_no_fid }, /* Syntek (Asus U3S) */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x174f, .idProduct = 0x8a33, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_stream_no_fid }, /* Syntek (JAOtech Smart Terminal) */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x174f, .idProduct = 0x8a34, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_stream_no_fid }, /* Miricle 307K */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x17dc, .idProduct = 0x0202, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_stream_no_fid }, /* Lenovo Thinkpad SL400/SL500 */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x17ef, .idProduct = 0x480b, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_stream_no_fid }, /* Aveo Technology USB 2.0 Camera */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x1871, .idProduct = 0x0306, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = UVC_INFO_QUIRK(UVC_QUIRK_PROBE_MINMAX | UVC_QUIRK_PROBE_EXTRAFIELDS) }, /* Aveo Technology USB 2.0 Camera (Tasco USB Microscope) */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x1871, .idProduct = 0x0516, .bInterfaceClass = USB_CLASS_VENDOR_SPEC, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0 }, /* Ecamm Pico iMage */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x18cd, .idProduct = 0xcafe, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = UVC_INFO_QUIRK(UVC_QUIRK_PROBE_EXTRAFIELDS) }, /* Manta MM-353 Plako */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x18ec, .idProduct = 0x3188, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_probe_minmax }, /* FSC WebCam V30S */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x18ec, .idProduct = 0x3288, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_probe_minmax }, /* Arkmicro unbranded */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x18ec, .idProduct = 0x3290, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_probe_def }, /* The Imaging Source USB CCD cameras */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x199e, .idProduct = 0x8102, .bInterfaceClass = USB_CLASS_VENDOR_SPEC, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0 }, /* Bodelin ProScopeHR */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_DEV_HI | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x19ab, .idProduct = 0x1000, .bcdDevice_hi = 0x0126, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = UVC_INFO_QUIRK(UVC_QUIRK_STATUS_INTERVAL) }, /* MSI StarCam 370i */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x1b3b, .idProduct = 0x2951, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_probe_minmax }, /* Generalplus Technology Inc. 808 Camera */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x1b3f, .idProduct = 0x2002, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_probe_minmax }, /* Shenzhen Aoni Electronic Co.,Ltd 2K FHD camera */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x1bcf, .idProduct = 0x0b40, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&(const struct uvc_device_info){ .uvc_version = 0x010a, } }, /* SiGma Micro USB Web Camera */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x1c4f, .idProduct = 0x3000, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = UVC_INFO_QUIRK(UVC_QUIRK_PROBE_MINMAX | UVC_QUIRK_IGNORE_SELECTOR_UNIT) }, /* Oculus VR Positional Tracker DK2 */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x2833, .idProduct = 0x0201, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_force_y8 }, /* Oculus VR Rift Sensor */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x2833, .idProduct = 0x0211, .bInterfaceClass = USB_CLASS_VENDOR_SPEC, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_force_y8 }, /* GEO Semiconductor GC6500 */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x29fe, .idProduct = 0x4d53, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = UVC_INFO_QUIRK(UVC_QUIRK_FORCE_BPP) }, /* SunplusIT Inc HD Camera */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x2b7e, .idProduct = 0xb752, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = UVC_PC_PROTOCOL_15, .driver_info = (kernel_ulong_t)&uvc_ctrl_power_line_uvc11 }, /* Lenovo Integrated Camera */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x30c9, .idProduct = 0x0093, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = UVC_PC_PROTOCOL_15, .driver_info = (kernel_ulong_t)&uvc_ctrl_power_line_uvc11 }, /* Sonix Technology USB 2.0 Camera */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x3277, .idProduct = 0x0072, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_ctrl_power_line_limited }, /* Acer EasyCamera */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x5986, .idProduct = 0x1172, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_ctrl_power_line_limited }, /* Acer EasyCamera */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x5986, .idProduct = 0x1180, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_ctrl_power_line_limited }, /* Intel D410/ASR depth camera */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x8086, .idProduct = 0x0ad2, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = UVC_INFO_META(V4L2_META_FMT_D4XX) }, /* Intel D415/ASRC depth camera */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x8086, .idProduct = 0x0ad3, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = UVC_INFO_META(V4L2_META_FMT_D4XX) }, /* Intel D430/AWG depth camera */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x8086, .idProduct = 0x0ad4, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = UVC_INFO_META(V4L2_META_FMT_D4XX) }, /* Intel RealSense D4M */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x8086, .idProduct = 0x0b03, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = UVC_INFO_META(V4L2_META_FMT_D4XX) }, /* Intel D435/AWGC depth camera */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x8086, .idProduct = 0x0b07, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = UVC_INFO_META(V4L2_META_FMT_D4XX) }, /* Intel D435i depth camera */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x8086, .idProduct = 0x0b3a, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = UVC_INFO_META(V4L2_META_FMT_D4XX) }, /* Intel D405 Depth Camera */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x8086, .idProduct = 0x0b5b, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = UVC_INFO_META(V4L2_META_FMT_D4XX) }, /* Intel D455 Depth Camera */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x8086, .idProduct = 0x0b5c, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = UVC_INFO_META(V4L2_META_FMT_D4XX) }, /* Generic USB Video Class */ { USB_INTERFACE_INFO(USB_CLASS_VIDEO, 1, UVC_PC_PROTOCOL_UNDEFINED) }, { USB_INTERFACE_INFO(USB_CLASS_VIDEO, 1, UVC_PC_PROTOCOL_15) }, {} }; MODULE_DEVICE_TABLE(usb, uvc_ids); struct uvc_driver uvc_driver = { .driver = { .name = "uvcvideo", .probe = uvc_probe, .disconnect = uvc_disconnect, .suspend = uvc_suspend, .resume = uvc_resume, .reset_resume = uvc_reset_resume, .id_table = uvc_ids, .supports_autosuspend = 1, }, }; static int __init uvc_init(void) { int ret; uvc_debugfs_init(); ret = usb_register(&uvc_driver.driver); if (ret < 0) { uvc_debugfs_cleanup(); return ret; } return 0; } static void __exit uvc_cleanup(void) { usb_deregister(&uvc_driver.driver); uvc_debugfs_cleanup(); } module_init(uvc_init); module_exit(uvc_cleanup); MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL"); MODULE_VERSION(DRIVER_VERSION);
2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 /* * linux/fs/nls/nls_koi8-r.c * * Charset koi8-r translation tables. * Generated automatically from the Unicode and charset * tables from the Unicode Organization (www.unicode.org). * The Unicode to charset table has only exact mappings. */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/nls.h> #include <linux/errno.h> static const wchar_t charset2uni[256] = { /* 0x00*/ 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, /* 0x10*/ 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, /* 0x20*/ 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, /* 0x30*/ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, /* 0x40*/ 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, /* 0x50*/ 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, /* 0x60*/ 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, /* 0x70*/ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f, /* 0x80*/ 0x2500, 0x2502, 0x250c, 0x2510, 0x2514, 0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590, /* 0x90*/ 0x2591, 0x2592, 0x2593, 0x2320, 0x25a0, 0x2219, 0x221a, 0x2248, 0x2264, 0x2265, 0x00a0, 0x2321, 0x00b0, 0x00b2, 0x00b7, 0x00f7, /* 0xa0*/ 0x2550, 0x2551, 0x2552, 0x0451, 0x2553, 0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d, 0x255e, /* 0xb0*/ 0x255f, 0x2560, 0x2561, 0x0401, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567, 0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x00a9, /* 0xc0*/ 0x044e, 0x0430, 0x0431, 0x0446, 0x0434, 0x0435, 0x0444, 0x0433, 0x0445, 0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e, /* 0xd0*/ 0x043f, 0x044f, 0x0440, 0x0441, 0x0442, 0x0443, 0x0436, 0x0432, 0x044c, 0x044b, 0x0437, 0x0448, 0x044d, 0x0449, 0x0447, 0x044a, /* 0xe0*/ 0x042e, 0x0410, 0x0411, 0x0426, 0x0414, 0x0415, 0x0424, 0x0413, 0x0425, 0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e, /* 0xf0*/ 0x041f, 0x042f, 0x0420, 0x0421, 0x0422, 0x0423, 0x0416, 0x0412, 0x042c, 0x042b, 0x0417, 0x0428, 0x042d, 0x0429, 0x0427, 0x042a, }; static const unsigned char page00[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x9a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0xbf, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x9c, 0x00, 0x9d, 0x00, 0x00, 0x00, 0x00, 0x9e, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9f, /* 0xf0-0xf7 */ }; static const unsigned char page04[256] = { 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0xe1, 0xe2, 0xf7, 0xe7, 0xe4, 0xe5, 0xf6, 0xfa, /* 0x10-0x17 */ 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, /* 0x18-0x1f */ 0xf2, 0xf3, 0xf4, 0xf5, 0xe6, 0xe8, 0xe3, 0xfe, /* 0x20-0x27 */ 0xfb, 0xfd, 0xff, 0xf9, 0xf8, 0xfc, 0xe0, 0xf1, /* 0x28-0x2f */ 0xc1, 0xc2, 0xd7, 0xc7, 0xc4, 0xc5, 0xd6, 0xda, /* 0x30-0x37 */ 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0, /* 0x38-0x3f */ 0xd2, 0xd3, 0xd4, 0xd5, 0xc6, 0xc8, 0xc3, 0xde, /* 0x40-0x47 */ 0xdb, 0xdd, 0xdf, 0xd9, 0xd8, 0xdc, 0xc0, 0xd1, /* 0x48-0x4f */ 0x00, 0xa3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ }; static const unsigned char page22[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x95, 0x96, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x97, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x98, 0x99, 0x00, 0x00, /* 0x60-0x67 */ }; static const unsigned char page23[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x93, 0x9b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ }; static const unsigned char page25[256] = { 0x80, 0x00, 0x81, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x82, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x83, 0x00, 0x00, 0x00, 0x84, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x85, 0x00, 0x00, 0x00, 0x86, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x87, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x88, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x89, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x8a, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0xa0, 0xa1, 0xa2, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, /* 0x50-0x57 */ 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0, /* 0x58-0x5f */ 0xb1, 0xb2, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, /* 0x60-0x67 */ 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x8b, 0x00, 0x00, 0x00, 0x8c, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x8d, 0x00, 0x00, 0x00, 0x8e, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x8f, 0x90, 0x91, 0x92, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x94, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ }; static const unsigned char *const page_uni2charset[256] = { page00, NULL, NULL, NULL, page04, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, page22, page23, NULL, page25, NULL, NULL, }; static const unsigned char charset2lower[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ 0xb0, 0xb1, 0xb2, 0xa3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xf0-0xf7 */ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xf8-0xff */ }; static const unsigned char charset2upper[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ 0xa0, 0xa1, 0xa2, 0xb3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xd0-0xd7 */ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xd8-0xdf */ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ }; static int uni2char(wchar_t uni, unsigned char *out, int boundlen) { const unsigned char *uni2charset; unsigned char cl = uni & 0x00ff; unsigned char ch = (uni & 0xff00) >> 8; if (boundlen <= 0) return -ENAMETOOLONG; uni2charset = page_uni2charset[ch]; if (uni2charset && uni2charset[cl]) out[0] = uni2charset[cl]; else return -EINVAL; return 1; } static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) { *uni = charset2uni[*rawstring]; if (*uni == 0x0000) return -EINVAL; return 1; } static struct nls_table table = { .charset = "koi8-r", .uni2char = uni2char, .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, }; static int __init init_nls_koi8_r(void) { return register_nls(&table); } static void __exit exit_nls_koi8_r(void) { unregister_nls(&table); } module_init(init_nls_koi8_r) module_exit(exit_nls_koi8_r) MODULE_LICENSE("Dual BSD/GPL");
14 7 7 9 3 9 1 4 6 3 5 6 5 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 /* SPDX-License-Identifier: GPL-2.0 */ #include <linux/module.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_tproxy.h> #include <net/inet_sock.h> #include <net/tcp.h> #include <linux/if_ether.h> #include <net/netfilter/ipv4/nf_defrag_ipv4.h> #if IS_ENABLED(CONFIG_NF_TABLES_IPV6) #include <net/netfilter/ipv6/nf_defrag_ipv6.h> #endif struct nft_tproxy { u8 sreg_addr; u8 sreg_port; u8 family; }; static void nft_tproxy_eval_v4(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_tproxy *priv = nft_expr_priv(expr); struct sk_buff *skb = pkt->skb; const struct iphdr *iph = ip_hdr(skb); struct udphdr _hdr, *hp; __be32 taddr = 0; __be16 tport = 0; struct sock *sk; if (pkt->tprot != IPPROTO_TCP && pkt->tprot != IPPROTO_UDP) { regs->verdict.code = NFT_BREAK; return; } hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr); if (!hp) { regs->verdict.code = NFT_BREAK; return; } /* check if there's an ongoing connection on the packet addresses, this * happens if the redirect already happened and the current packet * belongs to an already established connection */ sk = nf_tproxy_get_sock_v4(nft_net(pkt), skb, iph->protocol, iph->saddr, iph->daddr, hp->source, hp->dest, skb->dev, NF_TPROXY_LOOKUP_ESTABLISHED); if (priv->sreg_addr) taddr = nft_reg_load_be32(&regs->data[priv->sreg_addr]); taddr = nf_tproxy_laddr4(skb, taddr, iph->daddr); if (priv->sreg_port) tport = nft_reg_load_be16(&regs->data[priv->sreg_port]); if (!tport) tport = hp->dest; /* UDP has no TCP_TIME_WAIT state, so we never enter here */ if (sk && sk->sk_state == TCP_TIME_WAIT) { /* reopening a TIME_WAIT connection needs special handling */ sk = nf_tproxy_handle_time_wait4(nft_net(pkt), skb, taddr, tport, sk); } else if (!sk) { /* no, there's no established connection, check if * there's a listener on the redirected addr/port */ sk = nf_tproxy_get_sock_v4(nft_net(pkt), skb, iph->protocol, iph->saddr, taddr, hp->source, tport, skb->dev, NF_TPROXY_LOOKUP_LISTENER); } if (sk && nf_tproxy_sk_is_transparent(sk)) nf_tproxy_assign_sock(skb, sk); else regs->verdict.code = NFT_BREAK; } #if IS_ENABLED(CONFIG_NF_TABLES_IPV6) static void nft_tproxy_eval_v6(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_tproxy *priv = nft_expr_priv(expr); struct sk_buff *skb = pkt->skb; const struct ipv6hdr *iph = ipv6_hdr(skb); int thoff = nft_thoff(pkt); struct udphdr _hdr, *hp; struct in6_addr taddr; __be16 tport = 0; struct sock *sk; int l4proto; memset(&taddr, 0, sizeof(taddr)); if (pkt->tprot != IPPROTO_TCP && pkt->tprot != IPPROTO_UDP) { regs->verdict.code = NFT_BREAK; return; } l4proto = pkt->tprot; hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr); if (hp == NULL) { regs->verdict.code = NFT_BREAK; return; } /* check if there's an ongoing connection on the packet addresses, this * happens if the redirect already happened and the current packet * belongs to an already established connection */ sk = nf_tproxy_get_sock_v6(nft_net(pkt), skb, thoff, l4proto, &iph->saddr, &iph->daddr, hp->source, hp->dest, nft_in(pkt), NF_TPROXY_LOOKUP_ESTABLISHED); if (priv->sreg_addr) memcpy(&taddr, &regs->data[priv->sreg_addr], sizeof(taddr)); taddr = *nf_tproxy_laddr6(skb, &taddr, &iph->daddr); if (priv->sreg_port) tport = nft_reg_load_be16(&regs->data[priv->sreg_port]); if (!tport) tport = hp->dest; /* UDP has no TCP_TIME_WAIT state, so we never enter here */ if (sk && sk->sk_state == TCP_TIME_WAIT) { /* reopening a TIME_WAIT connection needs special handling */ sk = nf_tproxy_handle_time_wait6(skb, l4proto, thoff, nft_net(pkt), &taddr, tport, sk); } else if (!sk) { /* no there's no established connection, check if * there's a listener on the redirected addr/port */ sk = nf_tproxy_get_sock_v6(nft_net(pkt), skb, thoff, l4proto, &iph->saddr, &taddr, hp->source, tport, nft_in(pkt), NF_TPROXY_LOOKUP_LISTENER); } /* NOTE: assign_sock consumes our sk reference */ if (sk && nf_tproxy_sk_is_transparent(sk)) nf_tproxy_assign_sock(skb, sk); else regs->verdict.code = NFT_BREAK; } #endif static void nft_tproxy_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_tproxy *priv = nft_expr_priv(expr); switch (nft_pf(pkt)) { case NFPROTO_IPV4: switch (priv->family) { case NFPROTO_IPV4: case NFPROTO_UNSPEC: nft_tproxy_eval_v4(expr, regs, pkt); return; } break; #if IS_ENABLED(CONFIG_NF_TABLES_IPV6) case NFPROTO_IPV6: switch (priv->family) { case NFPROTO_IPV6: case NFPROTO_UNSPEC: nft_tproxy_eval_v6(expr, regs, pkt); return; } #endif } regs->verdict.code = NFT_BREAK; } static const struct nla_policy nft_tproxy_policy[NFTA_TPROXY_MAX + 1] = { [NFTA_TPROXY_FAMILY] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_TPROXY_REG_ADDR] = { .type = NLA_U32 }, [NFTA_TPROXY_REG_PORT] = { .type = NLA_U32 }, }; static int nft_tproxy_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_tproxy *priv = nft_expr_priv(expr); unsigned int alen = 0; int err; if (!tb[NFTA_TPROXY_FAMILY] || (!tb[NFTA_TPROXY_REG_ADDR] && !tb[NFTA_TPROXY_REG_PORT])) return -EINVAL; priv->family = ntohl(nla_get_be32(tb[NFTA_TPROXY_FAMILY])); switch (ctx->family) { case NFPROTO_IPV4: if (priv->family != NFPROTO_IPV4) return -EINVAL; break; #if IS_ENABLED(CONFIG_NF_TABLES_IPV6) case NFPROTO_IPV6: if (priv->family != NFPROTO_IPV6) return -EINVAL; break; #endif case NFPROTO_INET: break; default: return -EOPNOTSUPP; } /* Address is specified but the rule family is not set accordingly */ if (priv->family == NFPROTO_UNSPEC && tb[NFTA_TPROXY_REG_ADDR]) return -EINVAL; switch (priv->family) { case NFPROTO_IPV4: alen = sizeof_field(union nf_inet_addr, in); err = nf_defrag_ipv4_enable(ctx->net); if (err) return err; break; #if IS_ENABLED(CONFIG_NF_TABLES_IPV6) case NFPROTO_IPV6: alen = sizeof_field(union nf_inet_addr, in6); err = nf_defrag_ipv6_enable(ctx->net); if (err) return err; break; #endif case NFPROTO_UNSPEC: /* No address is specified here */ err = nf_defrag_ipv4_enable(ctx->net); if (err) return err; #if IS_ENABLED(CONFIG_NF_TABLES_IPV6) err = nf_defrag_ipv6_enable(ctx->net); if (err) return err; #endif break; default: return -EOPNOTSUPP; } if (tb[NFTA_TPROXY_REG_ADDR]) { err = nft_parse_register_load(tb[NFTA_TPROXY_REG_ADDR], &priv->sreg_addr, alen); if (err < 0) return err; } if (tb[NFTA_TPROXY_REG_PORT]) { err = nft_parse_register_load(tb[NFTA_TPROXY_REG_PORT], &priv->sreg_port, sizeof(u16)); if (err < 0) return err; } return 0; } static void nft_tproxy_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { const struct nft_tproxy *priv = nft_expr_priv(expr); switch (priv->family) { case NFPROTO_IPV4: nf_defrag_ipv4_disable(ctx->net); break; #if IS_ENABLED(CONFIG_NF_TABLES_IPV6) case NFPROTO_IPV6: nf_defrag_ipv6_disable(ctx->net); break; #endif case NFPROTO_UNSPEC: nf_defrag_ipv4_disable(ctx->net); #if IS_ENABLED(CONFIG_NF_TABLES_IPV6) nf_defrag_ipv6_disable(ctx->net); #endif break; } } static int nft_tproxy_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_tproxy *priv = nft_expr_priv(expr); if (nla_put_be32(skb, NFTA_TPROXY_FAMILY, htonl(priv->family))) return -1; if (priv->sreg_addr && nft_dump_register(skb, NFTA_TPROXY_REG_ADDR, priv->sreg_addr)) return -1; if (priv->sreg_port && nft_dump_register(skb, NFTA_TPROXY_REG_PORT, priv->sreg_port)) return -1; return 0; } static int nft_tproxy_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nft_data **data) { return nft_chain_validate_hooks(ctx->chain, 1 << NF_INET_PRE_ROUTING); } static struct nft_expr_type nft_tproxy_type; static const struct nft_expr_ops nft_tproxy_ops = { .type = &nft_tproxy_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_tproxy)), .eval = nft_tproxy_eval, .init = nft_tproxy_init, .destroy = nft_tproxy_destroy, .dump = nft_tproxy_dump, .reduce = NFT_REDUCE_READONLY, .validate = nft_tproxy_validate, }; static struct nft_expr_type nft_tproxy_type __read_mostly = { .name = "tproxy", .ops = &nft_tproxy_ops, .policy = nft_tproxy_policy, .maxattr = NFTA_TPROXY_MAX, .owner = THIS_MODULE, }; static int __init nft_tproxy_module_init(void) { return nft_register_expr(&nft_tproxy_type); } static void __exit nft_tproxy_module_exit(void) { nft_unregister_expr(&nft_tproxy_type); } module_init(nft_tproxy_module_init); module_exit(nft_tproxy_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Máté Eckl"); MODULE_DESCRIPTION("nf_tables tproxy support module"); MODULE_ALIAS_NFT_EXPR("tproxy");
1 4 3 3 2 4 3 5 1 1 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 // SPDX-License-Identifier: GPL-2.0-only /* * vsock sock_diag(7) module * * Copyright (C) 2017 Red Hat, Inc. * Author: Stefan Hajnoczi <stefanha@redhat.com> */ #include <linux/module.h> #include <linux/sock_diag.h> #include <linux/vm_sockets_diag.h> #include <net/af_vsock.h> static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, u32 portid, u32 seq, u32 flags) { struct vsock_sock *vsk = vsock_sk(sk); struct vsock_diag_msg *rep; struct nlmsghdr *nlh; nlh = nlmsg_put(skb, portid, seq, SOCK_DIAG_BY_FAMILY, sizeof(*rep), flags); if (!nlh) return -EMSGSIZE; rep = nlmsg_data(nlh); rep->vdiag_family = AF_VSOCK; /* Lock order dictates that sk_lock is acquired before * vsock_table_lock, so we cannot lock here. Simply don't take * sk_lock; sk is guaranteed to stay alive since vsock_table_lock is * held. */ rep->vdiag_type = sk->sk_type; rep->vdiag_state = sk->sk_state; rep->vdiag_shutdown = sk->sk_shutdown; rep->vdiag_src_cid = vsk->local_addr.svm_cid; rep->vdiag_src_port = vsk->local_addr.svm_port; rep->vdiag_dst_cid = vsk->remote_addr.svm_cid; rep->vdiag_dst_port = vsk->remote_addr.svm_port; rep->vdiag_ino = sock_i_ino(sk); sock_diag_save_cookie(sk, rep->vdiag_cookie); return 0; } static int vsock_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct vsock_diag_req *req; struct vsock_sock *vsk; unsigned int bucket; unsigned int last_i; unsigned int table; struct net *net; unsigned int i; req = nlmsg_data(cb->nlh); net = sock_net(skb->sk); /* State saved between calls: */ table = cb->args[0]; bucket = cb->args[1]; i = last_i = cb->args[2]; /* TODO VMCI pending sockets? */ spin_lock_bh(&vsock_table_lock); /* Bind table (locally created sockets) */ if (table == 0) { while (bucket < ARRAY_SIZE(vsock_bind_table)) { struct list_head *head = &vsock_bind_table[bucket]; i = 0; list_for_each_entry(vsk, head, bound_table) { struct sock *sk = sk_vsock(vsk); if (!net_eq(sock_net(sk), net)) continue; if (i < last_i) goto next_bind; if (!(req->vdiag_states & (1 << sk->sk_state))) goto next_bind; if (sk_diag_fill(sk, skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI) < 0) goto done; next_bind: i++; } last_i = 0; bucket++; } table++; bucket = 0; } /* Connected table (accepted connections) */ while (bucket < ARRAY_SIZE(vsock_connected_table)) { struct list_head *head = &vsock_connected_table[bucket]; i = 0; list_for_each_entry(vsk, head, connected_table) { struct sock *sk = sk_vsock(vsk); /* Skip sockets we've already seen above */ if (__vsock_in_bound_table(vsk)) continue; if (!net_eq(sock_net(sk), net)) continue; if (i < last_i) goto next_connected; if (!(req->vdiag_states & (1 << sk->sk_state))) goto next_connected; if (sk_diag_fill(sk, skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI) < 0) goto done; next_connected: i++; } last_i = 0; bucket++; } done: spin_unlock_bh(&vsock_table_lock); cb->args[0] = table; cb->args[1] = bucket; cb->args[2] = i; return skb->len; } static int vsock_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) { int hdrlen = sizeof(struct vsock_diag_req); struct net *net = sock_net(skb->sk); if (nlmsg_len(h) < hdrlen) return -EINVAL; if (h->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = vsock_diag_dump, }; return netlink_dump_start(net->diag_nlsk, skb, h, &c); } return -EOPNOTSUPP; } static const struct sock_diag_handler vsock_diag_handler = { .family = AF_VSOCK, .dump = vsock_diag_handler_dump, }; static int __init vsock_diag_init(void) { return sock_diag_register(&vsock_diag_handler); } static void __exit vsock_diag_exit(void) { sock_diag_unregister(&vsock_diag_handler); } module_init(vsock_diag_init); module_exit(vsock_diag_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("VMware Virtual Sockets monitoring via SOCK_DIAG"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 40 /* AF_VSOCK */);
45 71 70 2 45 71 69 12 69 48 1 64 2 30 72 2 5 45 3 3 3 3 45 13 45 45 45 6 1 41 3 14 38 43 5 1 14 38 2 8 1 36 33 33 4 4 1 1 13 6 6 3 2 3 3 2 2 2 2 2 2 1 1 1 5 3 2 2 1 1 2 1 1 2 1 1 2 2 4 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 // SPDX-License-Identifier: GPL-2.0-only /* * namei.c * * PURPOSE * Inode name handling routines for the OSTA-UDF(tm) filesystem. * * COPYRIGHT * (C) 1998-2004 Ben Fennema * (C) 1999-2000 Stelias Computing Inc * * HISTORY * * 12/12/98 blf Created. Split out the lookup code from dir.c * 04/19/99 blf link, mknod, symlink support */ #include "udfdecl.h" #include "udf_i.h" #include "udf_sb.h" #include <linux/string.h> #include <linux/errno.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/crc-itu-t.h> #include <linux/exportfs.h> #include <linux/iversion.h> static inline int udf_match(int len1, const unsigned char *name1, int len2, const unsigned char *name2) { if (len1 != len2) return 0; return !memcmp(name1, name2, len1); } /** * udf_fiiter_find_entry - find entry in given directory. * * @dir: directory inode to search in * @child: qstr of the name * @iter: iter to use for searching * * This function searches in the directory @dir for a file name @child. When * found, @iter points to the position in the directory with given entry. * * Returns 0 on success, < 0 on error (including -ENOENT). */ static int udf_fiiter_find_entry(struct inode *dir, const struct qstr *child, struct udf_fileident_iter *iter) { int flen; unsigned char *fname = NULL; struct super_block *sb = dir->i_sb; int isdotdot = child->len == 2 && child->name[0] == '.' && child->name[1] == '.'; int ret; fname = kmalloc(UDF_NAME_LEN, GFP_NOFS); if (!fname) return -ENOMEM; for (ret = udf_fiiter_init(iter, dir, 0); !ret && iter->pos < dir->i_size; ret = udf_fiiter_advance(iter)) { if (iter->fi.fileCharacteristics & FID_FILE_CHAR_DELETED) { if (!UDF_QUERY_FLAG(sb, UDF_FLAG_UNDELETE)) continue; } if (iter->fi.fileCharacteristics & FID_FILE_CHAR_HIDDEN) { if (!UDF_QUERY_FLAG(sb, UDF_FLAG_UNHIDE)) continue; } if ((iter->fi.fileCharacteristics & FID_FILE_CHAR_PARENT) && isdotdot) goto out_ok; if (!iter->fi.lengthFileIdent) continue; flen = udf_get_filename(sb, iter->name, iter->fi.lengthFileIdent, fname, UDF_NAME_LEN); if (flen < 0) { ret = flen; goto out_err; } if (udf_match(flen, fname, child->len, child->name)) goto out_ok; } if (!ret) ret = -ENOENT; out_err: udf_fiiter_release(iter); out_ok: kfree(fname); return ret; } static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct inode *inode = NULL; struct udf_fileident_iter iter; int err; if (dentry->d_name.len > UDF_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); err = udf_fiiter_find_entry(dir, &dentry->d_name, &iter); if (err < 0 && err != -ENOENT) return ERR_PTR(err); if (err == 0) { struct kernel_lb_addr loc; loc = lelb_to_cpu(iter.fi.icb.extLocation); udf_fiiter_release(&iter); inode = udf_iget(dir->i_sb, &loc); } return d_splice_alias(inode, dentry); } static int udf_expand_dir_adinicb(struct inode *inode, udf_pblk_t *block) { udf_pblk_t newblock; struct buffer_head *dbh = NULL; struct kernel_lb_addr eloc; struct extent_position epos; uint8_t alloctype; struct udf_inode_info *iinfo = UDF_I(inode); struct udf_fileident_iter iter; uint8_t *impuse; int ret; if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) alloctype = ICBTAG_FLAG_AD_SHORT; else alloctype = ICBTAG_FLAG_AD_LONG; if (!inode->i_size) { iinfo->i_alloc_type = alloctype; mark_inode_dirty(inode); return 0; } /* alloc block, and copy data to it */ *block = udf_new_block(inode->i_sb, inode, iinfo->i_location.partitionReferenceNum, iinfo->i_location.logicalBlockNum, &ret); if (!(*block)) return ret; newblock = udf_get_pblock(inode->i_sb, *block, iinfo->i_location.partitionReferenceNum, 0); if (newblock == 0xffffffff) return -EFSCORRUPTED; dbh = sb_getblk(inode->i_sb, newblock); if (!dbh) return -ENOMEM; lock_buffer(dbh); memcpy(dbh->b_data, iinfo->i_data, inode->i_size); memset(dbh->b_data + inode->i_size, 0, inode->i_sb->s_blocksize - inode->i_size); set_buffer_uptodate(dbh); unlock_buffer(dbh); /* Drop inline data, add block instead */ iinfo->i_alloc_type = alloctype; memset(iinfo->i_data + iinfo->i_lenEAttr, 0, iinfo->i_lenAlloc); iinfo->i_lenAlloc = 0; eloc.logicalBlockNum = *block; eloc.partitionReferenceNum = iinfo->i_location.partitionReferenceNum; iinfo->i_lenExtents = inode->i_size; epos.bh = NULL; epos.block = iinfo->i_location; epos.offset = udf_file_entry_alloc_offset(inode); ret = udf_add_aext(inode, &epos, &eloc, inode->i_size, 0); brelse(epos.bh); if (ret < 0) { brelse(dbh); udf_free_blocks(inode->i_sb, inode, &eloc, 0, 1); return ret; } mark_inode_dirty(inode); /* Now fixup tags in moved directory entries */ for (ret = udf_fiiter_init(&iter, inode, 0); !ret && iter.pos < inode->i_size; ret = udf_fiiter_advance(&iter)) { iter.fi.descTag.tagLocation = cpu_to_le32(*block); if (iter.fi.lengthOfImpUse != cpu_to_le16(0)) impuse = dbh->b_data + iter.pos + sizeof(struct fileIdentDesc); else impuse = NULL; udf_fiiter_write_fi(&iter, impuse); } brelse(dbh); /* * We don't expect the iteration to fail as the directory has been * already verified to be correct */ WARN_ON_ONCE(ret); udf_fiiter_release(&iter); return 0; } static int udf_fiiter_add_entry(struct inode *dir, struct dentry *dentry, struct udf_fileident_iter *iter) { struct udf_inode_info *dinfo = UDF_I(dir); int nfidlen, namelen = 0; int ret; int off, blksize = 1 << dir->i_blkbits; udf_pblk_t block; char name[UDF_NAME_LEN_CS0]; if (dentry) { namelen = udf_put_filename(dir->i_sb, dentry->d_name.name, dentry->d_name.len, name, UDF_NAME_LEN_CS0); if (!namelen) return -ENAMETOOLONG; } nfidlen = ALIGN(sizeof(struct fileIdentDesc) + namelen, UDF_NAME_PAD); for (ret = udf_fiiter_init(iter, dir, 0); !ret && iter->pos < dir->i_size; ret = udf_fiiter_advance(iter)) { if (iter->fi.fileCharacteristics & FID_FILE_CHAR_DELETED) { if (udf_dir_entry_len(&iter->fi) == nfidlen) { iter->fi.descTag.tagSerialNum = cpu_to_le16(1); iter->fi.fileVersionNum = cpu_to_le16(1); iter->fi.fileCharacteristics = 0; iter->fi.lengthFileIdent = namelen; iter->fi.lengthOfImpUse = cpu_to_le16(0); memcpy(iter->namebuf, name, namelen); iter->name = iter->namebuf; return 0; } } } if (ret) { udf_fiiter_release(iter); return ret; } if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB && blksize - udf_ext0_offset(dir) - iter->pos < nfidlen) { udf_fiiter_release(iter); ret = udf_expand_dir_adinicb(dir, &block); if (ret) return ret; ret = udf_fiiter_init(iter, dir, dir->i_size); if (ret < 0) return ret; } /* Get blocknumber to use for entry tag */ if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { block = dinfo->i_location.logicalBlockNum; } else { block = iter->eloc.logicalBlockNum + ((iter->elen - 1) >> dir->i_blkbits); } off = iter->pos & (blksize - 1); if (!off) off = blksize; /* Entry fits into current block? */ if (blksize - udf_ext0_offset(dir) - off >= nfidlen) goto store_fi; ret = udf_fiiter_append_blk(iter); if (ret) { udf_fiiter_release(iter); return ret; } /* Entry will be completely in the new block? Update tag location... */ if (!(iter->pos & (blksize - 1))) block = iter->eloc.logicalBlockNum + ((iter->elen - 1) >> dir->i_blkbits); store_fi: memset(&iter->fi, 0, sizeof(struct fileIdentDesc)); if (UDF_SB(dir->i_sb)->s_udfrev >= 0x0200) udf_new_tag((char *)(&iter->fi), TAG_IDENT_FID, 3, 1, block, sizeof(struct tag)); else udf_new_tag((char *)(&iter->fi), TAG_IDENT_FID, 2, 1, block, sizeof(struct tag)); iter->fi.fileVersionNum = cpu_to_le16(1); iter->fi.lengthFileIdent = namelen; iter->fi.lengthOfImpUse = cpu_to_le16(0); memcpy(iter->namebuf, name, namelen); iter->name = iter->namebuf; dir->i_size += nfidlen; if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { dinfo->i_lenAlloc += nfidlen; } else { /* Truncate last extent to proper size */ udf_fiiter_update_elen(iter, iter->elen - (dinfo->i_lenExtents - dir->i_size)); } mark_inode_dirty(dir); return 0; } static void udf_fiiter_delete_entry(struct udf_fileident_iter *iter) { iter->fi.fileCharacteristics |= FID_FILE_CHAR_DELETED; if (UDF_QUERY_FLAG(iter->dir->i_sb, UDF_FLAG_STRICT)) memset(&iter->fi.icb, 0x00, sizeof(struct long_ad)); udf_fiiter_write_fi(iter, NULL); } static void udf_add_fid_counter(struct super_block *sb, bool dir, int val) { struct logicalVolIntegrityDescImpUse *lvidiu = udf_sb_lvidiu(sb); if (!lvidiu) return; mutex_lock(&UDF_SB(sb)->s_alloc_mutex); if (dir) le32_add_cpu(&lvidiu->numDirs, val); else le32_add_cpu(&lvidiu->numFiles, val); udf_updated_lvid(sb); mutex_unlock(&UDF_SB(sb)->s_alloc_mutex); } static int udf_add_nondir(struct dentry *dentry, struct inode *inode) { struct udf_inode_info *iinfo = UDF_I(inode); struct inode *dir = d_inode(dentry->d_parent); struct udf_fileident_iter iter; int err; err = udf_fiiter_add_entry(dir, dentry, &iter); if (err) { inode_dec_link_count(inode); discard_new_inode(inode); return err; } iter.fi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); iter.fi.icb.extLocation = cpu_to_lelb(iinfo->i_location); *(__le32 *)((struct allocDescImpUse *)iter.fi.icb.impUse)->impUse = cpu_to_le32(iinfo->i_unique & 0x00000000FFFFFFFFUL); udf_fiiter_write_fi(&iter, NULL); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); udf_fiiter_release(&iter); udf_add_fid_counter(dir->i_sb, false, 1); d_instantiate_new(dentry, inode); return 0; } static int udf_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { struct inode *inode = udf_new_inode(dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); inode->i_data.a_ops = &udf_aops; inode->i_op = &udf_file_inode_operations; inode->i_fop = &udf_file_operations; mark_inode_dirty(inode); return udf_add_nondir(dentry, inode); } static int udf_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode) { struct inode *inode = udf_new_inode(dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); inode->i_data.a_ops = &udf_aops; inode->i_op = &udf_file_inode_operations; inode->i_fop = &udf_file_operations; mark_inode_dirty(inode); d_tmpfile(file, inode); unlock_new_inode(inode); return finish_open_simple(file, 0); } static int udf_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct inode *inode; if (!old_valid_dev(rdev)) return -EINVAL; inode = udf_new_inode(dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); init_special_inode(inode, mode, rdev); return udf_add_nondir(dentry, inode); } static int udf_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; struct udf_fileident_iter iter; int err; struct udf_inode_info *dinfo = UDF_I(dir); struct udf_inode_info *iinfo; inode = udf_new_inode(dir, S_IFDIR | mode); if (IS_ERR(inode)) return PTR_ERR(inode); iinfo = UDF_I(inode); inode->i_op = &udf_dir_inode_operations; inode->i_fop = &udf_dir_operations; err = udf_fiiter_add_entry(inode, NULL, &iter); if (err) { clear_nlink(inode); discard_new_inode(inode); return err; } set_nlink(inode, 2); iter.fi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); iter.fi.icb.extLocation = cpu_to_lelb(dinfo->i_location); *(__le32 *)((struct allocDescImpUse *)iter.fi.icb.impUse)->impUse = cpu_to_le32(dinfo->i_unique & 0x00000000FFFFFFFFUL); iter.fi.fileCharacteristics = FID_FILE_CHAR_DIRECTORY | FID_FILE_CHAR_PARENT; udf_fiiter_write_fi(&iter, NULL); udf_fiiter_release(&iter); mark_inode_dirty(inode); err = udf_fiiter_add_entry(dir, dentry, &iter); if (err) { clear_nlink(inode); discard_new_inode(inode); return err; } iter.fi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); iter.fi.icb.extLocation = cpu_to_lelb(iinfo->i_location); *(__le32 *)((struct allocDescImpUse *)iter.fi.icb.impUse)->impUse = cpu_to_le32(iinfo->i_unique & 0x00000000FFFFFFFFUL); iter.fi.fileCharacteristics |= FID_FILE_CHAR_DIRECTORY; udf_fiiter_write_fi(&iter, NULL); udf_fiiter_release(&iter); udf_add_fid_counter(dir->i_sb, true, 1); inc_nlink(dir); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); d_instantiate_new(dentry, inode); return 0; } static int empty_dir(struct inode *dir) { struct udf_fileident_iter iter; int ret; for (ret = udf_fiiter_init(&iter, dir, 0); !ret && iter.pos < dir->i_size; ret = udf_fiiter_advance(&iter)) { if (iter.fi.lengthFileIdent && !(iter.fi.fileCharacteristics & FID_FILE_CHAR_DELETED)) { udf_fiiter_release(&iter); return 0; } } udf_fiiter_release(&iter); return 1; } static int udf_rmdir(struct inode *dir, struct dentry *dentry) { int ret; struct inode *inode = d_inode(dentry); struct udf_fileident_iter iter; struct kernel_lb_addr tloc; ret = udf_fiiter_find_entry(dir, &dentry->d_name, &iter); if (ret) goto out; ret = -EFSCORRUPTED; tloc = lelb_to_cpu(iter.fi.icb.extLocation); if (udf_get_lb_pblock(dir->i_sb, &tloc, 0) != inode->i_ino) goto end_rmdir; ret = -ENOTEMPTY; if (!empty_dir(inode)) goto end_rmdir; udf_fiiter_delete_entry(&iter); if (inode->i_nlink != 2) udf_warn(inode->i_sb, "empty directory has nlink != 2 (%u)\n", inode->i_nlink); clear_nlink(inode); inode->i_size = 0; inode_dec_link_count(dir); udf_add_fid_counter(dir->i_sb, true, -1); inode_set_mtime_to_ts(dir, inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode))); mark_inode_dirty(dir); ret = 0; end_rmdir: udf_fiiter_release(&iter); out: return ret; } static int udf_unlink(struct inode *dir, struct dentry *dentry) { int ret; struct inode *inode = d_inode(dentry); struct udf_fileident_iter iter; struct kernel_lb_addr tloc; ret = udf_fiiter_find_entry(dir, &dentry->d_name, &iter); if (ret) goto out; ret = -EFSCORRUPTED; tloc = lelb_to_cpu(iter.fi.icb.extLocation); if (udf_get_lb_pblock(dir->i_sb, &tloc, 0) != inode->i_ino) goto end_unlink; if (!inode->i_nlink) { udf_debug("Deleting nonexistent file (%lu), %u\n", inode->i_ino, inode->i_nlink); set_nlink(inode, 1); } udf_fiiter_delete_entry(&iter); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); inode_dec_link_count(inode); udf_add_fid_counter(dir->i_sb, false, -1); inode_set_ctime_to_ts(inode, inode_get_ctime(dir)); ret = 0; end_unlink: udf_fiiter_release(&iter); out: return ret; } static int udf_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { struct inode *inode = udf_new_inode(dir, S_IFLNK | 0777); struct pathComponent *pc; const char *compstart; struct extent_position epos = {}; int eoffset, elen = 0; uint8_t *ea; int err; udf_pblk_t block; unsigned char *name = NULL; int namelen; struct udf_inode_info *iinfo; struct super_block *sb = dir->i_sb; if (IS_ERR(inode)) return PTR_ERR(inode); iinfo = UDF_I(inode); down_write(&iinfo->i_data_sem); name = kmalloc(UDF_NAME_LEN_CS0, GFP_NOFS); if (!name) { err = -ENOMEM; goto out_no_entry; } inode->i_data.a_ops = &udf_symlink_aops; inode->i_op = &udf_symlink_inode_operations; inode_nohighmem(inode); if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { struct kernel_lb_addr eloc; uint32_t bsize; block = udf_new_block(sb, inode, iinfo->i_location.partitionReferenceNum, iinfo->i_location.logicalBlockNum, &err); if (!block) goto out_no_entry; epos.block = iinfo->i_location; epos.offset = udf_file_entry_alloc_offset(inode); epos.bh = NULL; eloc.logicalBlockNum = block; eloc.partitionReferenceNum = iinfo->i_location.partitionReferenceNum; bsize = sb->s_blocksize; iinfo->i_lenExtents = bsize; err = udf_add_aext(inode, &epos, &eloc, bsize, 0); brelse(epos.bh); if (err < 0) { udf_free_blocks(sb, inode, &eloc, 0, 1); goto out_no_entry; } block = udf_get_pblock(sb, block, iinfo->i_location.partitionReferenceNum, 0); epos.bh = sb_getblk(sb, block); if (unlikely(!epos.bh)) { err = -ENOMEM; udf_free_blocks(sb, inode, &eloc, 0, 1); goto out_no_entry; } lock_buffer(epos.bh); memset(epos.bh->b_data, 0x00, bsize); set_buffer_uptodate(epos.bh); unlock_buffer(epos.bh); mark_buffer_dirty_inode(epos.bh, inode); ea = epos.bh->b_data + udf_ext0_offset(inode); } else ea = iinfo->i_data + iinfo->i_lenEAttr; eoffset = sb->s_blocksize - udf_ext0_offset(inode); pc = (struct pathComponent *)ea; if (*symname == '/') { do { symname++; } while (*symname == '/'); pc->componentType = 1; pc->lengthComponentIdent = 0; pc->componentFileVersionNum = 0; elen += sizeof(struct pathComponent); } err = -ENAMETOOLONG; while (*symname) { if (elen + sizeof(struct pathComponent) > eoffset) goto out_no_entry; pc = (struct pathComponent *)(ea + elen); compstart = symname; do { symname++; } while (*symname && *symname != '/'); pc->componentType = 5; pc->lengthComponentIdent = 0; pc->componentFileVersionNum = 0; if (compstart[0] == '.') { if ((symname - compstart) == 1) pc->componentType = 4; else if ((symname - compstart) == 2 && compstart[1] == '.') pc->componentType = 3; } if (pc->componentType == 5) { namelen = udf_put_filename(sb, compstart, symname - compstart, name, UDF_NAME_LEN_CS0); if (!namelen) goto out_no_entry; if (elen + sizeof(struct pathComponent) + namelen > eoffset) goto out_no_entry; else pc->lengthComponentIdent = namelen; memcpy(pc->componentIdent, name, namelen); } elen += sizeof(struct pathComponent) + pc->lengthComponentIdent; if (*symname) { do { symname++; } while (*symname == '/'); } } brelse(epos.bh); inode->i_size = elen; if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) iinfo->i_lenAlloc = inode->i_size; else udf_truncate_tail_extent(inode); mark_inode_dirty(inode); up_write(&iinfo->i_data_sem); err = udf_add_nondir(dentry, inode); out: kfree(name); return err; out_no_entry: up_write(&iinfo->i_data_sem); inode_dec_link_count(inode); discard_new_inode(inode); goto out; } static int udf_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(old_dentry); struct udf_fileident_iter iter; int err; err = udf_fiiter_add_entry(dir, dentry, &iter); if (err) return err; iter.fi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); iter.fi.icb.extLocation = cpu_to_lelb(UDF_I(inode)->i_location); if (UDF_SB(inode->i_sb)->s_lvid_bh) { *(__le32 *)((struct allocDescImpUse *)iter.fi.icb.impUse)->impUse = cpu_to_le32(lvid_get_unique_id(inode->i_sb)); } udf_fiiter_write_fi(&iter, NULL); udf_fiiter_release(&iter); inc_nlink(inode); udf_add_fid_counter(dir->i_sb, false, 1); inode_set_ctime_current(inode); mark_inode_dirty(inode); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); ihold(inode); d_instantiate(dentry, inode); return 0; } /* Anybody can rename anything with this: the permission checks are left to the * higher-level routines. */ static int udf_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { struct inode *old_inode = d_inode(old_dentry); struct inode *new_inode = d_inode(new_dentry); struct udf_fileident_iter oiter, niter, diriter; bool has_diriter = false, is_dir = false; int retval; struct kernel_lb_addr tloc; if (flags & ~RENAME_NOREPLACE) return -EINVAL; retval = udf_fiiter_find_entry(old_dir, &old_dentry->d_name, &oiter); if (retval) return retval; tloc = lelb_to_cpu(oiter.fi.icb.extLocation); if (udf_get_lb_pblock(old_dir->i_sb, &tloc, 0) != old_inode->i_ino) { retval = -ENOENT; goto out_oiter; } if (S_ISDIR(old_inode->i_mode)) { if (new_inode) { retval = -ENOTEMPTY; if (!empty_dir(new_inode)) goto out_oiter; } is_dir = true; } if (is_dir && old_dir != new_dir) { retval = udf_fiiter_find_entry(old_inode, &dotdot_name, &diriter); if (retval == -ENOENT) { udf_err(old_inode->i_sb, "directory (ino %lu) has no '..' entry\n", old_inode->i_ino); retval = -EFSCORRUPTED; } if (retval) goto out_oiter; has_diriter = true; tloc = lelb_to_cpu(diriter.fi.icb.extLocation); if (udf_get_lb_pblock(old_inode->i_sb, &tloc, 0) != old_dir->i_ino) { retval = -EFSCORRUPTED; udf_err(old_inode->i_sb, "directory (ino %lu) has parent entry pointing to another inode (%lu != %u)\n", old_inode->i_ino, old_dir->i_ino, udf_get_lb_pblock(old_inode->i_sb, &tloc, 0)); goto out_oiter; } } retval = udf_fiiter_find_entry(new_dir, &new_dentry->d_name, &niter); if (retval && retval != -ENOENT) goto out_oiter; /* Entry found but not passed by VFS? */ if (!retval && !new_inode) { retval = -EFSCORRUPTED; udf_fiiter_release(&niter); goto out_oiter; } /* Entry not found? Need to add one... */ if (retval) { udf_fiiter_release(&niter); retval = udf_fiiter_add_entry(new_dir, new_dentry, &niter); if (retval) goto out_oiter; } /* * Like most other Unix systems, set the ctime for inodes on a * rename. */ inode_set_ctime_current(old_inode); mark_inode_dirty(old_inode); /* * ok, that's it */ niter.fi.fileVersionNum = oiter.fi.fileVersionNum; niter.fi.fileCharacteristics = oiter.fi.fileCharacteristics; memcpy(&(niter.fi.icb), &(oiter.fi.icb), sizeof(oiter.fi.icb)); udf_fiiter_write_fi(&niter, NULL); udf_fiiter_release(&niter); /* * The old entry may have moved due to new entry allocation. Find it * again. */ udf_fiiter_release(&oiter); retval = udf_fiiter_find_entry(old_dir, &old_dentry->d_name, &oiter); if (retval) { udf_err(old_dir->i_sb, "failed to find renamed entry again in directory (ino %lu)\n", old_dir->i_ino); } else { udf_fiiter_delete_entry(&oiter); udf_fiiter_release(&oiter); } if (new_inode) { inode_set_ctime_current(new_inode); inode_dec_link_count(new_inode); udf_add_fid_counter(old_dir->i_sb, S_ISDIR(new_inode->i_mode), -1); } inode_set_mtime_to_ts(old_dir, inode_set_ctime_current(old_dir)); inode_set_mtime_to_ts(new_dir, inode_set_ctime_current(new_dir)); mark_inode_dirty(old_dir); mark_inode_dirty(new_dir); if (has_diriter) { diriter.fi.icb.extLocation = cpu_to_lelb(UDF_I(new_dir)->i_location); udf_update_tag((char *)&diriter.fi, udf_dir_entry_len(&diriter.fi)); udf_fiiter_write_fi(&diriter, NULL); udf_fiiter_release(&diriter); } if (is_dir) { inode_dec_link_count(old_dir); if (new_inode) inode_dec_link_count(new_inode); else { inc_nlink(new_dir); mark_inode_dirty(new_dir); } } return 0; out_oiter: if (has_diriter) udf_fiiter_release(&diriter); udf_fiiter_release(&oiter); return retval; } static struct dentry *udf_get_parent(struct dentry *child) { struct kernel_lb_addr tloc; struct udf_fileident_iter iter; int err; err = udf_fiiter_find_entry(d_inode(child), &dotdot_name, &iter); if (err) return ERR_PTR(err); tloc = lelb_to_cpu(iter.fi.icb.extLocation); udf_fiiter_release(&iter); return d_obtain_alias(udf_iget(child->d_sb, &tloc)); } static struct dentry *udf_nfs_get_inode(struct super_block *sb, u32 block, u16 partref, __u32 generation) { struct inode *inode; struct kernel_lb_addr loc; if (block == 0) return ERR_PTR(-ESTALE); loc.logicalBlockNum = block; loc.partitionReferenceNum = partref; inode = udf_iget(sb, &loc); if (IS_ERR(inode)) return ERR_CAST(inode); if (generation && inode->i_generation != generation) { iput(inode); return ERR_PTR(-ESTALE); } return d_obtain_alias(inode); } static struct dentry *udf_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { if (fh_len < 3 || (fh_type != FILEID_UDF_WITH_PARENT && fh_type != FILEID_UDF_WITHOUT_PARENT)) return NULL; return udf_nfs_get_inode(sb, fid->udf.block, fid->udf.partref, fid->udf.generation); } static struct dentry *udf_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { if (fh_len < 5 || fh_type != FILEID_UDF_WITH_PARENT) return NULL; return udf_nfs_get_inode(sb, fid->udf.parent_block, fid->udf.parent_partref, fid->udf.parent_generation); } static int udf_encode_fh(struct inode *inode, __u32 *fh, int *lenp, struct inode *parent) { int len = *lenp; struct kernel_lb_addr location = UDF_I(inode)->i_location; struct fid *fid = (struct fid *)fh; int type = FILEID_UDF_WITHOUT_PARENT; if (parent && (len < 5)) { *lenp = 5; return FILEID_INVALID; } else if (len < 3) { *lenp = 3; return FILEID_INVALID; } *lenp = 3; fid->udf.block = location.logicalBlockNum; fid->udf.partref = location.partitionReferenceNum; fid->udf.parent_partref = 0; fid->udf.generation = inode->i_generation; if (parent) { location = UDF_I(parent)->i_location; fid->udf.parent_block = location.logicalBlockNum; fid->udf.parent_partref = location.partitionReferenceNum; fid->udf.parent_generation = inode->i_generation; *lenp = 5; type = FILEID_UDF_WITH_PARENT; } return type; } const struct export_operations udf_export_ops = { .encode_fh = udf_encode_fh, .fh_to_dentry = udf_fh_to_dentry, .fh_to_parent = udf_fh_to_parent, .get_parent = udf_get_parent, }; const struct inode_operations udf_dir_inode_operations = { .lookup = udf_lookup, .create = udf_create, .link = udf_link, .unlink = udf_unlink, .symlink = udf_symlink, .mkdir = udf_mkdir, .rmdir = udf_rmdir, .mknod = udf_mknod, .rename = udf_rename, .tmpfile = udf_tmpfile, };
1 1 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 // SPDX-License-Identifier: GPL-2.0-or-later /* * Virtual PTP 1588 clock for use with KVM guests * * Copyright (C) 2017 Red Hat Inc. */ #include <linux/device.h> #include <linux/kernel.h> #include <asm/pvclock.h> #include <asm/kvmclock.h> #include <linux/module.h> #include <uapi/asm/kvm_para.h> #include <uapi/linux/kvm_para.h> #include <linux/ptp_clock_kernel.h> #include <linux/ptp_kvm.h> #include <linux/set_memory.h> static phys_addr_t clock_pair_gpa; static struct kvm_clock_pairing clock_pair_glbl; static struct kvm_clock_pairing *clock_pair; int kvm_arch_ptp_init(void) { struct page *p; long ret; if (!kvm_para_available()) return -ENODEV; if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) { p = alloc_page(GFP_KERNEL | __GFP_ZERO); if (!p) return -ENOMEM; clock_pair = page_address(p); ret = set_memory_decrypted((unsigned long)clock_pair, 1); if (ret) { __free_page(p); clock_pair = NULL; goto nofree; } } else { clock_pair = &clock_pair_glbl; } clock_pair_gpa = slow_virt_to_phys(clock_pair); if (!pvclock_get_pvti_cpu0_va()) { ret = -ENODEV; goto err; } ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING, clock_pair_gpa, KVM_CLOCK_PAIRING_WALLCLOCK); if (ret == -KVM_ENOSYS) { ret = -ENODEV; goto err; } return ret; err: kvm_arch_ptp_exit(); nofree: return ret; } void kvm_arch_ptp_exit(void) { if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) { WARN_ON(set_memory_encrypted((unsigned long)clock_pair, 1)); free_page((unsigned long)clock_pair); clock_pair = NULL; } } int kvm_arch_ptp_get_clock(struct timespec64 *ts) { long ret; ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING, clock_pair_gpa, KVM_CLOCK_PAIRING_WALLCLOCK); if (ret != 0) { pr_err_ratelimited("clock offset hypercall ret %lu\n", ret); return -EOPNOTSUPP; } ts->tv_sec = clock_pair->sec; ts->tv_nsec = clock_pair->nsec; return 0; } int kvm_arch_ptp_get_crosststamp(u64 *cycle, struct timespec64 *tspec, struct clocksource **cs) { struct pvclock_vcpu_time_info *src; unsigned int version; long ret; src = this_cpu_pvti(); do { /* * We are using a TSC value read in the hosts * kvm_hc_clock_pairing handling. * So any changes to tsc_to_system_mul * and tsc_shift or any other pvclock * data invalidate that measurement. */ version = pvclock_read_begin(src); ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING, clock_pair_gpa, KVM_CLOCK_PAIRING_WALLCLOCK); if (ret != 0) { pr_err_ratelimited("clock pairing hypercall ret %lu\n", ret); return -EOPNOTSUPP; } tspec->tv_sec = clock_pair->sec; tspec->tv_nsec = clock_pair->nsec; *cycle = __pvclock_read_cycles(src, clock_pair->tsc); } while (pvclock_read_retry(src, version)); *cs = &kvm_clock; return 0; }
3 2 1 11 12 12 8 8 5 8 15 13 1 1 3 11 1 11 21 12 8 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/proc/root.c * * Copyright (C) 1991, 1992 Linus Torvalds * * proc root directory handling functions */ #include <linux/errno.h> #include <linux/time.h> #include <linux/proc_fs.h> #include <linux/stat.h> #include <linux/init.h> #include <linux/sched.h> #include <linux/sched/stat.h> #include <linux/module.h> #include <linux/bitops.h> #include <linux/user_namespace.h> #include <linux/fs_context.h> #include <linux/mount.h> #include <linux/pid_namespace.h> #include <linux/fs_parser.h> #include <linux/cred.h> #include <linux/magic.h> #include <linux/slab.h> #include "internal.h" struct proc_fs_context { struct pid_namespace *pid_ns; unsigned int mask; enum proc_hidepid hidepid; int gid; enum proc_pidonly pidonly; }; enum proc_param { Opt_gid, Opt_hidepid, Opt_subset, }; static const struct fs_parameter_spec proc_fs_parameters[] = { fsparam_u32("gid", Opt_gid), fsparam_string("hidepid", Opt_hidepid), fsparam_string("subset", Opt_subset), {} }; static inline int valid_hidepid(unsigned int value) { return (value == HIDEPID_OFF || value == HIDEPID_NO_ACCESS || value == HIDEPID_INVISIBLE || value == HIDEPID_NOT_PTRACEABLE); } static int proc_parse_hidepid_param(struct fs_context *fc, struct fs_parameter *param) { struct proc_fs_context *ctx = fc->fs_private; struct fs_parameter_spec hidepid_u32_spec = fsparam_u32("hidepid", Opt_hidepid); struct fs_parse_result result; int base = (unsigned long)hidepid_u32_spec.data; if (param->type != fs_value_is_string) return invalf(fc, "proc: unexpected type of hidepid value\n"); if (!kstrtouint(param->string, base, &result.uint_32)) { if (!valid_hidepid(result.uint_32)) return invalf(fc, "proc: unknown value of hidepid - %s\n", param->string); ctx->hidepid = result.uint_32; return 0; } if (!strcmp(param->string, "off")) ctx->hidepid = HIDEPID_OFF; else if (!strcmp(param->string, "noaccess")) ctx->hidepid = HIDEPID_NO_ACCESS; else if (!strcmp(param->string, "invisible")) ctx->hidepid = HIDEPID_INVISIBLE; else if (!strcmp(param->string, "ptraceable")) ctx->hidepid = HIDEPID_NOT_PTRACEABLE; else return invalf(fc, "proc: unknown value of hidepid - %s\n", param->string); return 0; } static int proc_parse_subset_param(struct fs_context *fc, char *value) { struct proc_fs_context *ctx = fc->fs_private; while (value) { char *ptr = strchr(value, ','); if (ptr != NULL) *ptr++ = '\0'; if (*value != '\0') { if (!strcmp(value, "pid")) { ctx->pidonly = PROC_PIDONLY_ON; } else { return invalf(fc, "proc: unsupported subset option - %s\n", value); } } value = ptr; } return 0; } static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct proc_fs_context *ctx = fc->fs_private; struct fs_parse_result result; int opt; opt = fs_parse(fc, proc_fs_parameters, param, &result); if (opt < 0) return opt; switch (opt) { case Opt_gid: ctx->gid = result.uint_32; break; case Opt_hidepid: if (proc_parse_hidepid_param(fc, param)) return -EINVAL; break; case Opt_subset: if (proc_parse_subset_param(fc, param->string) < 0) return -EINVAL; break; default: return -EINVAL; } ctx->mask |= 1 << opt; return 0; } static void proc_apply_options(struct proc_fs_info *fs_info, struct fs_context *fc, struct user_namespace *user_ns) { struct proc_fs_context *ctx = fc->fs_private; if (ctx->mask & (1 << Opt_gid)) fs_info->pid_gid = make_kgid(user_ns, ctx->gid); if (ctx->mask & (1 << Opt_hidepid)) fs_info->hide_pid = ctx->hidepid; if (ctx->mask & (1 << Opt_subset)) fs_info->pidonly = ctx->pidonly; } static int proc_fill_super(struct super_block *s, struct fs_context *fc) { struct proc_fs_context *ctx = fc->fs_private; struct inode *root_inode; struct proc_fs_info *fs_info; int ret; fs_info = kzalloc(sizeof(*fs_info), GFP_KERNEL); if (!fs_info) return -ENOMEM; fs_info->pid_ns = get_pid_ns(ctx->pid_ns); proc_apply_options(fs_info, fc, current_user_ns()); /* User space would break if executables or devices appear on proc */ s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC | SB_I_NODEV; s->s_flags |= SB_NODIRATIME | SB_NOSUID | SB_NOEXEC; s->s_blocksize = 1024; s->s_blocksize_bits = 10; s->s_magic = PROC_SUPER_MAGIC; s->s_op = &proc_sops; s->s_time_gran = 1; s->s_fs_info = fs_info; /* * procfs isn't actually a stacking filesystem; however, there is * too much magic going on inside it to permit stacking things on * top of it */ s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH; /* procfs dentries and inodes don't require IO to create */ s->s_shrink->seeks = 0; pde_get(&proc_root); root_inode = proc_get_inode(s, &proc_root); if (!root_inode) { pr_err("proc_fill_super: get root inode failed\n"); return -ENOMEM; } s->s_root = d_make_root(root_inode); if (!s->s_root) { pr_err("proc_fill_super: allocate dentry failed\n"); return -ENOMEM; } ret = proc_setup_self(s); if (ret) { return ret; } return proc_setup_thread_self(s); } static int proc_reconfigure(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; struct proc_fs_info *fs_info = proc_sb_info(sb); sync_filesystem(sb); proc_apply_options(fs_info, fc, current_user_ns()); return 0; } static int proc_get_tree(struct fs_context *fc) { return get_tree_nodev(fc, proc_fill_super); } static void proc_fs_context_free(struct fs_context *fc) { struct proc_fs_context *ctx = fc->fs_private; put_pid_ns(ctx->pid_ns); kfree(ctx); } static const struct fs_context_operations proc_fs_context_ops = { .free = proc_fs_context_free, .parse_param = proc_parse_param, .get_tree = proc_get_tree, .reconfigure = proc_reconfigure, }; static int proc_init_fs_context(struct fs_context *fc) { struct proc_fs_context *ctx; ctx = kzalloc(sizeof(struct proc_fs_context), GFP_KERNEL); if (!ctx) return -ENOMEM; ctx->pid_ns = get_pid_ns(task_active_pid_ns(current)); put_user_ns(fc->user_ns); fc->user_ns = get_user_ns(ctx->pid_ns->user_ns); fc->fs_private = ctx; fc->ops = &proc_fs_context_ops; return 0; } static void proc_kill_sb(struct super_block *sb) { struct proc_fs_info *fs_info = proc_sb_info(sb); if (!fs_info) { kill_anon_super(sb); return; } dput(fs_info->proc_self); dput(fs_info->proc_thread_self); kill_anon_super(sb); put_pid_ns(fs_info->pid_ns); kfree(fs_info); } static struct file_system_type proc_fs_type = { .name = "proc", .init_fs_context = proc_init_fs_context, .parameters = proc_fs_parameters, .kill_sb = proc_kill_sb, .fs_flags = FS_USERNS_MOUNT | FS_DISALLOW_NOTIFY_PERM, }; void __init proc_root_init(void) { proc_init_kmemcache(); set_proc_pid_nlink(); proc_self_init(); proc_thread_self_init(); proc_symlink("mounts", NULL, "self/mounts"); proc_net_init(); proc_mkdir("fs", NULL); proc_mkdir("driver", NULL); proc_create_mount_point("fs/nfsd"); /* somewhere for the nfsd filesystem to be mounted */ #if defined(CONFIG_SUN_OPENPROMFS) || defined(CONFIG_SUN_OPENPROMFS_MODULE) /* just give it a mountpoint */ proc_create_mount_point("openprom"); #endif proc_tty_init(); proc_mkdir("bus", NULL); proc_sys_init(); /* * Last things last. It is not like userspace processes eager * to open /proc files exist at this point but register last * anyway. */ register_filesystem(&proc_fs_type); } static int proc_root_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(path->dentry), stat); stat->nlink = proc_root.nlink + nr_processes(); return 0; } static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, unsigned int flags) { if (!proc_pid_lookup(dentry, flags)) return NULL; return proc_lookup(dir, dentry, flags); } static int proc_root_readdir(struct file *file, struct dir_context *ctx) { if (ctx->pos < FIRST_PROCESS_ENTRY) { int error = proc_readdir(file, ctx); if (unlikely(error <= 0)) return error; ctx->pos = FIRST_PROCESS_ENTRY; } return proc_pid_readdir(file, ctx); } /* * The root /proc directory is special, as it has the * <pid> directories. Thus we don't use the generic * directory handling functions for that.. */ static const struct file_operations proc_root_operations = { .read = generic_read_dir, .iterate_shared = proc_root_readdir, .llseek = generic_file_llseek, }; /* * proc root can do almost nothing.. */ static const struct inode_operations proc_root_inode_operations = { .lookup = proc_root_lookup, .getattr = proc_root_getattr, }; /* * This is the root "inode" in the /proc tree.. */ struct proc_dir_entry proc_root = { .low_ino = PROC_ROOT_INO, .namelen = 5, .mode = S_IFDIR | S_IRUGO | S_IXUGO, .nlink = 2, .refcnt = REFCOUNT_INIT(1), .proc_iops = &proc_root_inode_operations, .proc_dir_ops = &proc_root_operations, .parent = &proc_root, .subdir = RB_ROOT, .name = "/proc", };
8 2 6 4 1 1 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 /* SPDX-License-Identifier: GPL-2.0 */ #include <linux/module.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_socket.h> #include <net/inet_sock.h> #include <net/tcp.h> struct nft_socket { enum nft_socket_keys key:8; u8 level; u8 len; union { u8 dreg; }; }; static void nft_socket_wildcard(const struct nft_pktinfo *pkt, struct nft_regs *regs, struct sock *sk, u32 *dest) { switch (nft_pf(pkt)) { case NFPROTO_IPV4: nft_reg_store8(dest, inet_sk(sk)->inet_rcv_saddr == 0); break; #if IS_ENABLED(CONFIG_NF_TABLES_IPV6) case NFPROTO_IPV6: nft_reg_store8(dest, ipv6_addr_any(&sk->sk_v6_rcv_saddr)); break; #endif default: regs->verdict.code = NFT_BREAK; return; } } #ifdef CONFIG_SOCK_CGROUP_DATA static noinline bool nft_sock_get_eval_cgroupv2(u32 *dest, struct sock *sk, const struct nft_pktinfo *pkt, u32 level) { struct cgroup *cgrp; u64 cgid; if (!sk_fullsock(sk)) return false; cgrp = cgroup_ancestor(sock_cgroup_ptr(&sk->sk_cgrp_data), level); if (!cgrp) return false; cgid = cgroup_id(cgrp); memcpy(dest, &cgid, sizeof(u64)); return true; } #endif static struct sock *nft_socket_do_lookup(const struct nft_pktinfo *pkt) { const struct net_device *indev = nft_in(pkt); const struct sk_buff *skb = pkt->skb; struct sock *sk = NULL; if (!indev) return NULL; switch (nft_pf(pkt)) { case NFPROTO_IPV4: sk = nf_sk_lookup_slow_v4(nft_net(pkt), skb, indev); break; #if IS_ENABLED(CONFIG_NF_TABLES_IPV6) case NFPROTO_IPV6: sk = nf_sk_lookup_slow_v6(nft_net(pkt), skb, indev); break; #endif default: WARN_ON_ONCE(1); break; } return sk; } static void nft_socket_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_socket *priv = nft_expr_priv(expr); struct sk_buff *skb = pkt->skb; struct sock *sk = skb->sk; u32 *dest = &regs->data[priv->dreg]; if (sk && !net_eq(nft_net(pkt), sock_net(sk))) sk = NULL; if (!sk) sk = nft_socket_do_lookup(pkt); if (!sk) { regs->verdict.code = NFT_BREAK; return; } switch(priv->key) { case NFT_SOCKET_TRANSPARENT: nft_reg_store8(dest, inet_sk_transparent(sk)); break; case NFT_SOCKET_MARK: if (sk_fullsock(sk)) { *dest = READ_ONCE(sk->sk_mark); } else { regs->verdict.code = NFT_BREAK; return; } break; case NFT_SOCKET_WILDCARD: if (!sk_fullsock(sk)) { regs->verdict.code = NFT_BREAK; return; } nft_socket_wildcard(pkt, regs, sk, dest); break; #ifdef CONFIG_SOCK_CGROUP_DATA case NFT_SOCKET_CGROUPV2: if (!nft_sock_get_eval_cgroupv2(dest, sk, pkt, priv->level)) { regs->verdict.code = NFT_BREAK; return; } break; #endif default: WARN_ON(1); regs->verdict.code = NFT_BREAK; } if (sk != skb->sk) sock_gen_put(sk); } static const struct nla_policy nft_socket_policy[NFTA_SOCKET_MAX + 1] = { [NFTA_SOCKET_KEY] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_SOCKET_DREG] = { .type = NLA_U32 }, [NFTA_SOCKET_LEVEL] = NLA_POLICY_MAX(NLA_BE32, 255), }; static int nft_socket_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_socket *priv = nft_expr_priv(expr); unsigned int len; if (!tb[NFTA_SOCKET_DREG] || !tb[NFTA_SOCKET_KEY]) return -EINVAL; switch(ctx->family) { case NFPROTO_IPV4: #if IS_ENABLED(CONFIG_NF_TABLES_IPV6) case NFPROTO_IPV6: #endif case NFPROTO_INET: break; default: return -EOPNOTSUPP; } priv->key = ntohl(nla_get_be32(tb[NFTA_SOCKET_KEY])); switch(priv->key) { case NFT_SOCKET_TRANSPARENT: case NFT_SOCKET_WILDCARD: len = sizeof(u8); break; case NFT_SOCKET_MARK: len = sizeof(u32); break; #ifdef CONFIG_CGROUPS case NFT_SOCKET_CGROUPV2: { unsigned int level; if (!tb[NFTA_SOCKET_LEVEL]) return -EINVAL; level = ntohl(nla_get_be32(tb[NFTA_SOCKET_LEVEL])); if (level > 255) return -EOPNOTSUPP; priv->level = level; len = sizeof(u64); break; } #endif default: return -EOPNOTSUPP; } priv->len = len; return nft_parse_register_store(ctx, tb[NFTA_SOCKET_DREG], &priv->dreg, NULL, NFT_DATA_VALUE, len); } static int nft_socket_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_socket *priv = nft_expr_priv(expr); if (nla_put_be32(skb, NFTA_SOCKET_KEY, htonl(priv->key))) return -1; if (nft_dump_register(skb, NFTA_SOCKET_DREG, priv->dreg)) return -1; if (priv->key == NFT_SOCKET_CGROUPV2 && nla_put_be32(skb, NFTA_SOCKET_LEVEL, htonl(priv->level))) return -1; return 0; } static bool nft_socket_reduce(struct nft_regs_track *track, const struct nft_expr *expr) { const struct nft_socket *priv = nft_expr_priv(expr); const struct nft_socket *socket; if (!nft_reg_track_cmp(track, expr, priv->dreg)) { nft_reg_track_update(track, expr, priv->dreg, priv->len); return false; } socket = nft_expr_priv(track->regs[priv->dreg].selector); if (priv->key != socket->key || priv->dreg != socket->dreg || priv->level != socket->level) { nft_reg_track_update(track, expr, priv->dreg, priv->len); return false; } if (!track->regs[priv->dreg].bitwise) return true; return nft_expr_reduce_bitwise(track, expr); } static int nft_socket_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nft_data **data) { return nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_LOCAL_OUT)); } static struct nft_expr_type nft_socket_type; static const struct nft_expr_ops nft_socket_ops = { .type = &nft_socket_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_socket)), .eval = nft_socket_eval, .init = nft_socket_init, .dump = nft_socket_dump, .validate = nft_socket_validate, .reduce = nft_socket_reduce, }; static struct nft_expr_type nft_socket_type __read_mostly = { .name = "socket", .ops = &nft_socket_ops, .policy = nft_socket_policy, .maxattr = NFTA_SOCKET_MAX, .owner = THIS_MODULE, }; static int __init nft_socket_module_init(void) { return nft_register_expr(&nft_socket_type); } static void __exit nft_socket_module_exit(void) { nft_unregister_expr(&nft_socket_type); } module_init(nft_socket_module_init); module_exit(nft_socket_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Máté Eckl"); MODULE_DESCRIPTION("nf_tables socket match module"); MODULE_ALIAS_NFT_EXPR("socket");
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 // SPDX-License-Identifier: GPL-2.0-only /* * keyspan_remote: USB driver for the Keyspan DMR * * Copyright (C) 2005 Zymeta Corporation - Michael Downey (downey@zymeta.com) * * This driver has been put together with the support of Innosys, Inc. * and Keyspan, Inc the manufacturers of the Keyspan USB DMR product. */ #include <linux/kernel.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/usb/input.h> /* Parameters that can be passed to the driver. */ static int debug; module_param(debug, int, 0444); MODULE_PARM_DESC(debug, "Enable extra debug messages and information"); /* Vendor and product ids */ #define USB_KEYSPAN_VENDOR_ID 0x06CD #define USB_KEYSPAN_PRODUCT_UIA11 0x0202 /* Defines for converting the data from the remote. */ #define ZERO 0x18 #define ZERO_MASK 0x1F /* 5 bits for a 0 */ #define ONE 0x3C #define ONE_MASK 0x3F /* 6 bits for a 1 */ #define SYNC 0x3F80 #define SYNC_MASK 0x3FFF /* 14 bits for a SYNC sequence */ #define STOP 0x00 #define STOP_MASK 0x1F /* 5 bits for the STOP sequence */ #define GAP 0xFF #define RECV_SIZE 8 /* The UIA-11 type have a 8 byte limit. */ /* * Table that maps the 31 possible keycodes to input keys. * Currently there are 15 and 17 button models so RESERVED codes * are blank areas in the mapping. */ static const unsigned short keyspan_key_table[] = { KEY_RESERVED, /* 0 is just a place holder. */ KEY_RESERVED, KEY_STOP, KEY_PLAYCD, KEY_RESERVED, KEY_PREVIOUSSONG, KEY_REWIND, KEY_FORWARD, KEY_NEXTSONG, KEY_RESERVED, KEY_RESERVED, KEY_RESERVED, KEY_PAUSE, KEY_VOLUMEUP, KEY_RESERVED, KEY_RESERVED, KEY_RESERVED, KEY_VOLUMEDOWN, KEY_RESERVED, KEY_UP, KEY_RESERVED, KEY_MUTE, KEY_LEFT, KEY_ENTER, KEY_RIGHT, KEY_RESERVED, KEY_RESERVED, KEY_DOWN, KEY_RESERVED, KEY_KPASTERISK, KEY_RESERVED, KEY_MENU }; /* table of devices that work with this driver */ static const struct usb_device_id keyspan_table[] = { { USB_DEVICE(USB_KEYSPAN_VENDOR_ID, USB_KEYSPAN_PRODUCT_UIA11) }, { } /* Terminating entry */ }; /* Structure to store all the real stuff that a remote sends to us. */ struct keyspan_message { u16 system; u8 button; u8 toggle; }; /* Structure used for all the bit testing magic needed to be done. */ struct bit_tester { u32 tester; int len; int pos; int bits_left; u8 buffer[32]; }; /* Structure to hold all of our driver specific stuff */ struct usb_keyspan { char name[128]; char phys[64]; unsigned short keymap[ARRAY_SIZE(keyspan_key_table)]; struct usb_device *udev; struct input_dev *input; struct usb_interface *interface; struct usb_endpoint_descriptor *in_endpoint; struct urb* irq_urb; int open; dma_addr_t in_dma; unsigned char *in_buffer; /* variables used to parse messages from remote. */ struct bit_tester data; int stage; int toggle; }; static struct usb_driver keyspan_driver; /* * Debug routine that prints out what we've received from the remote. */ static void keyspan_print(struct usb_keyspan* dev) /*unsigned char* data)*/ { char codes[4 * RECV_SIZE]; int i; for (i = 0; i < RECV_SIZE; i++) snprintf(codes + i * 3, 4, "%02x ", dev->in_buffer[i]); dev_info(&dev->udev->dev, "%s\n", codes); } /* * Routine that manages the bit_tester structure. It makes sure that there are * at least bits_needed bits loaded into the tester. */ static int keyspan_load_tester(struct usb_keyspan* dev, int bits_needed) { if (dev->data.bits_left >= bits_needed) return 0; /* * Somehow we've missed the last message. The message will be repeated * though so it's not too big a deal */ if (dev->data.pos >= dev->data.len) { dev_dbg(&dev->interface->dev, "%s - Error ran out of data. pos: %d, len: %d\n", __func__, dev->data.pos, dev->data.len); return -1; } /* Load as much as we can into the tester. */ while ((dev->data.bits_left + 7 < (sizeof(dev->data.tester) * 8)) && (dev->data.pos < dev->data.len)) { dev->data.tester += (dev->data.buffer[dev->data.pos++] << dev->data.bits_left); dev->data.bits_left += 8; } return 0; } static void keyspan_report_button(struct usb_keyspan *remote, int button, int press) { struct input_dev *input = remote->input; input_event(input, EV_MSC, MSC_SCAN, button); input_report_key(input, remote->keymap[button], press); input_sync(input); } /* * Routine that handles all the logic needed to parse out the message from the remote. */ static void keyspan_check_data(struct usb_keyspan *remote) { int i; int found = 0; struct keyspan_message message; switch(remote->stage) { case 0: /* * In stage 0 we want to find the start of a message. The remote sends a 0xFF as filler. * So the first byte that isn't a FF should be the start of a new message. */ for (i = 0; i < RECV_SIZE && remote->in_buffer[i] == GAP; ++i); if (i < RECV_SIZE) { memcpy(remote->data.buffer, remote->in_buffer, RECV_SIZE); remote->data.len = RECV_SIZE; remote->data.pos = 0; remote->data.tester = 0; remote->data.bits_left = 0; remote->stage = 1; } break; case 1: /* * Stage 1 we should have 16 bytes and should be able to detect a * SYNC. The SYNC is 14 bits, 7 0's and then 7 1's. */ memcpy(remote->data.buffer + remote->data.len, remote->in_buffer, RECV_SIZE); remote->data.len += RECV_SIZE; found = 0; while ((remote->data.bits_left >= 14 || remote->data.pos < remote->data.len) && !found) { for (i = 0; i < 8; ++i) { if (keyspan_load_tester(remote, 14) != 0) { remote->stage = 0; return; } if ((remote->data.tester & SYNC_MASK) == SYNC) { remote->data.tester = remote->data.tester >> 14; remote->data.bits_left -= 14; found = 1; break; } else { remote->data.tester = remote->data.tester >> 1; --remote->data.bits_left; } } } if (!found) { remote->stage = 0; remote->data.len = 0; } else { remote->stage = 2; } break; case 2: /* * Stage 2 we should have 24 bytes which will be enough for a full * message. We need to parse out the system code, button code, * toggle code, and stop. */ memcpy(remote->data.buffer + remote->data.len, remote->in_buffer, RECV_SIZE); remote->data.len += RECV_SIZE; message.system = 0; for (i = 0; i < 9; i++) { keyspan_load_tester(remote, 6); if ((remote->data.tester & ZERO_MASK) == ZERO) { message.system = message.system << 1; remote->data.tester = remote->data.tester >> 5; remote->data.bits_left -= 5; } else if ((remote->data.tester & ONE_MASK) == ONE) { message.system = (message.system << 1) + 1; remote->data.tester = remote->data.tester >> 6; remote->data.bits_left -= 6; } else { dev_err(&remote->interface->dev, "%s - Unknown sequence found in system data.\n", __func__); remote->stage = 0; return; } } message.button = 0; for (i = 0; i < 5; i++) { keyspan_load_tester(remote, 6); if ((remote->data.tester & ZERO_MASK) == ZERO) { message.button = message.button << 1; remote->data.tester = remote->data.tester >> 5; remote->data.bits_left -= 5; } else if ((remote->data.tester & ONE_MASK) == ONE) { message.button = (message.button << 1) + 1; remote->data.tester = remote->data.tester >> 6; remote->data.bits_left -= 6; } else { dev_err(&remote->interface->dev, "%s - Unknown sequence found in button data.\n", __func__); remote->stage = 0; return; } } keyspan_load_tester(remote, 6); if ((remote->data.tester & ZERO_MASK) == ZERO) { message.toggle = 0; remote->data.tester = remote->data.tester >> 5; remote->data.bits_left -= 5; } else if ((remote->data.tester & ONE_MASK) == ONE) { message.toggle = 1; remote->data.tester = remote->data.tester >> 6; remote->data.bits_left -= 6; } else { dev_err(&remote->interface->dev, "%s - Error in message, invalid toggle.\n", __func__); remote->stage = 0; return; } keyspan_load_tester(remote, 5); if ((remote->data.tester & STOP_MASK) == STOP) { remote->data.tester = remote->data.tester >> 5; remote->data.bits_left -= 5; } else { dev_err(&remote->interface->dev, "Bad message received, no stop bit found.\n"); } dev_dbg(&remote->interface->dev, "%s found valid message: system: %d, button: %d, toggle: %d\n", __func__, message.system, message.button, message.toggle); if (message.toggle != remote->toggle) { keyspan_report_button(remote, message.button, 1); keyspan_report_button(remote, message.button, 0); remote->toggle = message.toggle; } remote->stage = 0; break; } } /* * Routine for sending all the initialization messages to the remote. */ static int keyspan_setup(struct usb_device* dev) { int retval = 0; retval = usb_control_msg(dev, usb_sndctrlpipe(dev, 0), 0x11, 0x40, 0x5601, 0x0, NULL, 0, USB_CTRL_SET_TIMEOUT); if (retval) { dev_dbg(&dev->dev, "%s - failed to set bit rate due to error: %d\n", __func__, retval); return(retval); } retval = usb_control_msg(dev, usb_sndctrlpipe(dev, 0), 0x44, 0x40, 0x0, 0x0, NULL, 0, USB_CTRL_SET_TIMEOUT); if (retval) { dev_dbg(&dev->dev, "%s - failed to set resume sensitivity due to error: %d\n", __func__, retval); return(retval); } retval = usb_control_msg(dev, usb_sndctrlpipe(dev, 0), 0x22, 0x40, 0x0, 0x0, NULL, 0, USB_CTRL_SET_TIMEOUT); if (retval) { dev_dbg(&dev->dev, "%s - failed to turn receive on due to error: %d\n", __func__, retval); return(retval); } dev_dbg(&dev->dev, "%s - Setup complete.\n", __func__); return(retval); } /* * Routine used to handle a new message that has come in. */ static void keyspan_irq_recv(struct urb *urb) { struct usb_keyspan *dev = urb->context; int retval; /* Check our status in case we need to bail out early. */ switch (urb->status) { case 0: break; /* Device went away so don't keep trying to read from it. */ case -ECONNRESET: case -ENOENT: case -ESHUTDOWN: return; default: goto resubmit; } if (debug) keyspan_print(dev); keyspan_check_data(dev); resubmit: retval = usb_submit_urb(urb, GFP_ATOMIC); if (retval) dev_err(&dev->interface->dev, "%s - usb_submit_urb failed with result: %d\n", __func__, retval); } static int keyspan_open(struct input_dev *dev) { struct usb_keyspan *remote = input_get_drvdata(dev); remote->irq_urb->dev = remote->udev; if (usb_submit_urb(remote->irq_urb, GFP_KERNEL)) return -EIO; return 0; } static void keyspan_close(struct input_dev *dev) { struct usb_keyspan *remote = input_get_drvdata(dev); usb_kill_urb(remote->irq_urb); } static struct usb_endpoint_descriptor *keyspan_get_in_endpoint(struct usb_host_interface *iface) { struct usb_endpoint_descriptor *endpoint; int i; for (i = 0; i < iface->desc.bNumEndpoints; ++i) { endpoint = &iface->endpoint[i].desc; if (usb_endpoint_is_int_in(endpoint)) { /* we found our interrupt in endpoint */ return endpoint; } } return NULL; } /* * Routine that sets up the driver to handle a specific USB device detected on the bus. */ static int keyspan_probe(struct usb_interface *interface, const struct usb_device_id *id) { struct usb_device *udev = interface_to_usbdev(interface); struct usb_endpoint_descriptor *endpoint; struct usb_keyspan *remote; struct input_dev *input_dev; int i, error; endpoint = keyspan_get_in_endpoint(interface->cur_altsetting); if (!endpoint) return -ENODEV; remote = kzalloc(sizeof(*remote), GFP_KERNEL); input_dev = input_allocate_device(); if (!remote || !input_dev) { error = -ENOMEM; goto fail1; } remote->udev = udev; remote->input = input_dev; remote->interface = interface; remote->in_endpoint = endpoint; remote->toggle = -1; /* Set to -1 so we will always not match the toggle from the first remote message. */ remote->in_buffer = usb_alloc_coherent(udev, RECV_SIZE, GFP_KERNEL, &remote->in_dma); if (!remote->in_buffer) { error = -ENOMEM; goto fail1; } remote->irq_urb = usb_alloc_urb(0, GFP_KERNEL); if (!remote->irq_urb) { error = -ENOMEM; goto fail2; } error = keyspan_setup(udev); if (error) { error = -ENODEV; goto fail3; } if (udev->manufacturer) strscpy(remote->name, udev->manufacturer, sizeof(remote->name)); if (udev->product) { if (udev->manufacturer) strlcat(remote->name, " ", sizeof(remote->name)); strlcat(remote->name, udev->product, sizeof(remote->name)); } if (!strlen(remote->name)) snprintf(remote->name, sizeof(remote->name), "USB Keyspan Remote %04x:%04x", le16_to_cpu(udev->descriptor.idVendor), le16_to_cpu(udev->descriptor.idProduct)); usb_make_path(udev, remote->phys, sizeof(remote->phys)); strlcat(remote->phys, "/input0", sizeof(remote->phys)); memcpy(remote->keymap, keyspan_key_table, sizeof(remote->keymap)); input_dev->name = remote->name; input_dev->phys = remote->phys; usb_to_input_id(udev, &input_dev->id); input_dev->dev.parent = &interface->dev; input_dev->keycode = remote->keymap; input_dev->keycodesize = sizeof(unsigned short); input_dev->keycodemax = ARRAY_SIZE(remote->keymap); input_set_capability(input_dev, EV_MSC, MSC_SCAN); __set_bit(EV_KEY, input_dev->evbit); for (i = 0; i < ARRAY_SIZE(keyspan_key_table); i++) __set_bit(keyspan_key_table[i], input_dev->keybit); __clear_bit(KEY_RESERVED, input_dev->keybit); input_set_drvdata(input_dev, remote); input_dev->open = keyspan_open; input_dev->close = keyspan_close; /* * Initialize the URB to access the device. * The urb gets sent to the device in keyspan_open() */ usb_fill_int_urb(remote->irq_urb, remote->udev, usb_rcvintpipe(remote->udev, endpoint->bEndpointAddress), remote->in_buffer, RECV_SIZE, keyspan_irq_recv, remote, endpoint->bInterval); remote->irq_urb->transfer_dma = remote->in_dma; remote->irq_urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP; /* we can register the device now, as it is ready */ error = input_register_device(remote->input); if (error) goto fail3; /* save our data pointer in this interface device */ usb_set_intfdata(interface, remote); return 0; fail3: usb_free_urb(remote->irq_urb); fail2: usb_free_coherent(udev, RECV_SIZE, remote->in_buffer, remote->in_dma); fail1: kfree(remote); input_free_device(input_dev); return error; } /* * Routine called when a device is disconnected from the USB. */ static void keyspan_disconnect(struct usb_interface *interface) { struct usb_keyspan *remote; remote = usb_get_intfdata(interface); usb_set_intfdata(interface, NULL); if (remote) { /* We have a valid driver structure so clean up everything we allocated. */ input_unregister_device(remote->input); usb_kill_urb(remote->irq_urb); usb_free_urb(remote->irq_urb); usb_free_coherent(remote->udev, RECV_SIZE, remote->in_buffer, remote->in_dma); kfree(remote); } } /* * Standard driver set up sections */ static struct usb_driver keyspan_driver = { .name = "keyspan_remote", .probe = keyspan_probe, .disconnect = keyspan_disconnect, .id_table = keyspan_table }; module_usb_driver(keyspan_driver); MODULE_DEVICE_TABLE(usb, keyspan_table); MODULE_AUTHOR("Michael Downey <downey@zymeta.com>"); MODULE_DESCRIPTION("Driver for the USB Keyspan remote control."); MODULE_LICENSE("GPL");
2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 // SPDX-License-Identifier: GPL-2.0-only /* * Software WEP encryption implementation * Copyright 2002, Jouni Malinen <jkmaline@cc.hut.fi> * Copyright 2003, Instant802 Networks, Inc. * Copyright (C) 2023 Intel Corporation */ #include <linux/netdevice.h> #include <linux/types.h> #include <linux/random.h> #include <linux/compiler.h> #include <linux/crc32.h> #include <linux/crypto.h> #include <linux/err.h> #include <linux/mm.h> #include <linux/scatterlist.h> #include <linux/slab.h> #include <asm/unaligned.h> #include <net/mac80211.h> #include "ieee80211_i.h" #include "wep.h" void ieee80211_wep_init(struct ieee80211_local *local) { /* start WEP IV from a random value */ get_random_bytes(&local->wep_iv, IEEE80211_WEP_IV_LEN); } static inline bool ieee80211_wep_weak_iv(u32 iv, int keylen) { /* * Fluhrer, Mantin, and Shamir have reported weaknesses in the * key scheduling algorithm of RC4. At least IVs (KeyByte + 3, * 0xff, N) can be used to speedup attacks, so avoid using them. */ if ((iv & 0xff00) == 0xff00) { u8 B = (iv >> 16) & 0xff; if (B >= 3 && B < 3 + keylen) return true; } return false; } static void ieee80211_wep_get_iv(struct ieee80211_local *local, int keylen, int keyidx, u8 *iv) { local->wep_iv++; if (ieee80211_wep_weak_iv(local->wep_iv, keylen)) local->wep_iv += 0x0100; if (!iv) return; *iv++ = (local->wep_iv >> 16) & 0xff; *iv++ = (local->wep_iv >> 8) & 0xff; *iv++ = local->wep_iv & 0xff; *iv++ = keyidx << 6; } static u8 *ieee80211_wep_add_iv(struct ieee80211_local *local, struct sk_buff *skb, int keylen, int keyidx) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); unsigned int hdrlen; u8 *newhdr; hdr->frame_control |= cpu_to_le16(IEEE80211_FCTL_PROTECTED); if (WARN_ON(skb_headroom(skb) < IEEE80211_WEP_IV_LEN)) return NULL; hdrlen = ieee80211_hdrlen(hdr->frame_control); newhdr = skb_push(skb, IEEE80211_WEP_IV_LEN); memmove(newhdr, newhdr + IEEE80211_WEP_IV_LEN, hdrlen); /* the HW only needs room for the IV, but not the actual IV */ if (info->control.hw_key && (info->control.hw_key->flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE)) return newhdr + hdrlen; ieee80211_wep_get_iv(local, keylen, keyidx, newhdr + hdrlen); return newhdr + hdrlen; } static void ieee80211_wep_remove_iv(struct ieee80211_local *local, struct sk_buff *skb, struct ieee80211_key *key) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; unsigned int hdrlen; hdrlen = ieee80211_hdrlen(hdr->frame_control); memmove(skb->data + IEEE80211_WEP_IV_LEN, skb->data, hdrlen); skb_pull(skb, IEEE80211_WEP_IV_LEN); } /* Perform WEP encryption using given key. data buffer must have tailroom * for 4-byte ICV. data_len must not include this ICV. Note: this function * does _not_ add IV. data = RC4(data | CRC32(data)) */ int ieee80211_wep_encrypt_data(struct arc4_ctx *ctx, u8 *rc4key, size_t klen, u8 *data, size_t data_len) { __le32 icv; icv = cpu_to_le32(~crc32_le(~0, data, data_len)); put_unaligned(icv, (__le32 *)(data + data_len)); arc4_setkey(ctx, rc4key, klen); arc4_crypt(ctx, data, data, data_len + IEEE80211_WEP_ICV_LEN); memzero_explicit(ctx, sizeof(*ctx)); return 0; } /* Perform WEP encryption on given skb. 4 bytes of extra space (IV) in the * beginning of the buffer 4 bytes of extra space (ICV) in the end of the * buffer will be added. Both IV and ICV will be transmitted, so the * payload length increases with 8 bytes. * * WEP frame payload: IV + TX key idx, RC4(data), ICV = RC4(CRC32(data)) */ int ieee80211_wep_encrypt(struct ieee80211_local *local, struct sk_buff *skb, const u8 *key, int keylen, int keyidx) { u8 *iv; size_t len; u8 rc4key[3 + WLAN_KEY_LEN_WEP104]; if (WARN_ON(skb_tailroom(skb) < IEEE80211_WEP_ICV_LEN)) return -1; iv = ieee80211_wep_add_iv(local, skb, keylen, keyidx); if (!iv) return -1; len = skb->len - (iv + IEEE80211_WEP_IV_LEN - skb->data); /* Prepend 24-bit IV to RC4 key */ memcpy(rc4key, iv, 3); /* Copy rest of the WEP key (the secret part) */ memcpy(rc4key + 3, key, keylen); /* Add room for ICV */ skb_put(skb, IEEE80211_WEP_ICV_LEN); return ieee80211_wep_encrypt_data(&local->wep_tx_ctx, rc4key, keylen + 3, iv + IEEE80211_WEP_IV_LEN, len); } /* Perform WEP decryption using given key. data buffer includes encrypted * payload, including 4-byte ICV, but _not_ IV. data_len must not include ICV. * Return 0 on success and -1 on ICV mismatch. */ int ieee80211_wep_decrypt_data(struct arc4_ctx *ctx, u8 *rc4key, size_t klen, u8 *data, size_t data_len) { __le32 crc; arc4_setkey(ctx, rc4key, klen); arc4_crypt(ctx, data, data, data_len + IEEE80211_WEP_ICV_LEN); memzero_explicit(ctx, sizeof(*ctx)); crc = cpu_to_le32(~crc32_le(~0, data, data_len)); if (memcmp(&crc, data + data_len, IEEE80211_WEP_ICV_LEN) != 0) /* ICV mismatch */ return -1; return 0; } /* Perform WEP decryption on given skb. Buffer includes whole WEP part of * the frame: IV (4 bytes), encrypted payload (including SNAP header), * ICV (4 bytes). skb->len includes both IV and ICV. * * Returns 0 if frame was decrypted successfully and ICV was correct and -1 on * failure. If frame is OK, IV and ICV will be removed, i.e., decrypted payload * is moved to the beginning of the skb and skb length will be reduced. */ static int ieee80211_wep_decrypt(struct ieee80211_local *local, struct sk_buff *skb, struct ieee80211_key *key) { u32 klen; u8 rc4key[3 + WLAN_KEY_LEN_WEP104]; u8 keyidx; struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; unsigned int hdrlen; size_t len; int ret = 0; if (!ieee80211_has_protected(hdr->frame_control)) return -1; hdrlen = ieee80211_hdrlen(hdr->frame_control); if (skb->len < hdrlen + IEEE80211_WEP_IV_LEN + IEEE80211_WEP_ICV_LEN) return -1; len = skb->len - hdrlen - IEEE80211_WEP_IV_LEN - IEEE80211_WEP_ICV_LEN; keyidx = skb->data[hdrlen + 3] >> 6; if (!key || keyidx != key->conf.keyidx) return -1; klen = 3 + key->conf.keylen; /* Prepend 24-bit IV to RC4 key */ memcpy(rc4key, skb->data + hdrlen, 3); /* Copy rest of the WEP key (the secret part) */ memcpy(rc4key + 3, key->conf.key, key->conf.keylen); if (ieee80211_wep_decrypt_data(&local->wep_rx_ctx, rc4key, klen, skb->data + hdrlen + IEEE80211_WEP_IV_LEN, len)) ret = -1; /* Trim ICV */ skb_trim(skb, skb->len - IEEE80211_WEP_ICV_LEN); /* Remove IV */ memmove(skb->data + IEEE80211_WEP_IV_LEN, skb->data, hdrlen); skb_pull(skb, IEEE80211_WEP_IV_LEN); return ret; } ieee80211_rx_result ieee80211_crypto_wep_decrypt(struct ieee80211_rx_data *rx) { struct sk_buff *skb = rx->skb; struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; __le16 fc = hdr->frame_control; if (!ieee80211_is_data(fc) && !ieee80211_is_auth(fc)) return RX_CONTINUE; if (!(status->flag & RX_FLAG_DECRYPTED)) { if (skb_linearize(rx->skb)) return RX_DROP_U_OOM; if (ieee80211_wep_decrypt(rx->local, rx->skb, rx->key)) return RX_DROP_U_WEP_DEC_FAIL; } else if (!(status->flag & RX_FLAG_IV_STRIPPED)) { if (!pskb_may_pull(rx->skb, ieee80211_hdrlen(fc) + IEEE80211_WEP_IV_LEN)) return RX_DROP_U_NO_IV; ieee80211_wep_remove_iv(rx->local, rx->skb, rx->key); /* remove ICV */ if (!(status->flag & RX_FLAG_ICV_STRIPPED) && pskb_trim(rx->skb, rx->skb->len - IEEE80211_WEP_ICV_LEN)) return RX_DROP_U_NO_ICV; } return RX_CONTINUE; } static int wep_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_key_conf *hw_key = info->control.hw_key; if (!hw_key) { if (ieee80211_wep_encrypt(tx->local, skb, tx->key->conf.key, tx->key->conf.keylen, tx->key->conf.keyidx)) return -1; } else if ((hw_key->flags & IEEE80211_KEY_FLAG_GENERATE_IV) || (hw_key->flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE)) { if (!ieee80211_wep_add_iv(tx->local, skb, tx->key->conf.keylen, tx->key->conf.keyidx)) return -1; } return 0; } ieee80211_tx_result ieee80211_crypto_wep_encrypt(struct ieee80211_tx_data *tx) { struct sk_buff *skb; ieee80211_tx_set_protected(tx); skb_queue_walk(&tx->skbs, skb) { if (wep_encrypt_skb(tx, skb) < 0) { I802_DEBUG_INC(tx->local->tx_handlers_drop_wep); return TX_DROP; } } return TX_CONTINUE; }
11 4 3 1 4 4 15 15 4 11 15 15 15 15 1 2 12 10 2 1 11 10 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 // SPDX-License-Identifier: GPL-2.0 #include <net/xsk_buff_pool.h> #include <net/xdp_sock.h> #include <net/xdp_sock_drv.h> #include "xsk_queue.h" #include "xdp_umem.h" #include "xsk.h" void xp_add_xsk(struct xsk_buff_pool *pool, struct xdp_sock *xs) { unsigned long flags; if (!xs->tx) return; spin_lock_irqsave(&pool->xsk_tx_list_lock, flags); list_add_rcu(&xs->tx_list, &pool->xsk_tx_list); spin_unlock_irqrestore(&pool->xsk_tx_list_lock, flags); } void xp_del_xsk(struct xsk_buff_pool *pool, struct xdp_sock *xs) { unsigned long flags; if (!xs->tx) return; spin_lock_irqsave(&pool->xsk_tx_list_lock, flags); list_del_rcu(&xs->tx_list); spin_unlock_irqrestore(&pool->xsk_tx_list_lock, flags); } void xp_destroy(struct xsk_buff_pool *pool) { if (!pool) return; kvfree(pool->tx_descs); kvfree(pool->heads); kvfree(pool); } int xp_alloc_tx_descs(struct xsk_buff_pool *pool, struct xdp_sock *xs) { pool->tx_descs = kvcalloc(xs->tx->nentries, sizeof(*pool->tx_descs), GFP_KERNEL); if (!pool->tx_descs) return -ENOMEM; return 0; } struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs, struct xdp_umem *umem) { bool unaligned = umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG; struct xsk_buff_pool *pool; struct xdp_buff_xsk *xskb; u32 i, entries; entries = unaligned ? umem->chunks : 0; pool = kvzalloc(struct_size(pool, free_heads, entries), GFP_KERNEL); if (!pool) goto out; pool->heads = kvcalloc(umem->chunks, sizeof(*pool->heads), GFP_KERNEL); if (!pool->heads) goto out; if (xs->tx) if (xp_alloc_tx_descs(pool, xs)) goto out; pool->chunk_mask = ~((u64)umem->chunk_size - 1); pool->addrs_cnt = umem->size; pool->heads_cnt = umem->chunks; pool->free_heads_cnt = umem->chunks; pool->headroom = umem->headroom; pool->chunk_size = umem->chunk_size; pool->chunk_shift = ffs(umem->chunk_size) - 1; pool->unaligned = unaligned; pool->frame_len = umem->chunk_size - umem->headroom - XDP_PACKET_HEADROOM; pool->umem = umem; pool->addrs = umem->addrs; pool->tx_metadata_len = umem->tx_metadata_len; pool->tx_sw_csum = umem->flags & XDP_UMEM_TX_SW_CSUM; INIT_LIST_HEAD(&pool->free_list); INIT_LIST_HEAD(&pool->xskb_list); INIT_LIST_HEAD(&pool->xsk_tx_list); spin_lock_init(&pool->xsk_tx_list_lock); spin_lock_init(&pool->cq_lock); refcount_set(&pool->users, 1); pool->fq = xs->fq_tmp; pool->cq = xs->cq_tmp; for (i = 0; i < pool->free_heads_cnt; i++) { xskb = &pool->heads[i]; xskb->pool = pool; xskb->xdp.frame_sz = umem->chunk_size - umem->headroom; INIT_LIST_HEAD(&xskb->free_list_node); INIT_LIST_HEAD(&xskb->xskb_list_node); if (pool->unaligned) pool->free_heads[i] = xskb; else xp_init_xskb_addr(xskb, pool, i * pool->chunk_size); } return pool; out: xp_destroy(pool); return NULL; } void xp_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq) { u32 i; for (i = 0; i < pool->heads_cnt; i++) pool->heads[i].xdp.rxq = rxq; } EXPORT_SYMBOL(xp_set_rxq_info); void xp_fill_cb(struct xsk_buff_pool *pool, struct xsk_cb_desc *desc) { u32 i; for (i = 0; i < pool->heads_cnt; i++) { struct xdp_buff_xsk *xskb = &pool->heads[i]; memcpy(xskb->cb + desc->off, desc->src, desc->bytes); } } EXPORT_SYMBOL(xp_fill_cb); static void xp_disable_drv_zc(struct xsk_buff_pool *pool) { struct netdev_bpf bpf; int err; ASSERT_RTNL(); if (pool->umem->zc) { bpf.command = XDP_SETUP_XSK_POOL; bpf.xsk.pool = NULL; bpf.xsk.queue_id = pool->queue_id; err = pool->netdev->netdev_ops->ndo_bpf(pool->netdev, &bpf); if (err) WARN(1, "Failed to disable zero-copy!\n"); } } #define NETDEV_XDP_ACT_ZC (NETDEV_XDP_ACT_BASIC | \ NETDEV_XDP_ACT_REDIRECT | \ NETDEV_XDP_ACT_XSK_ZEROCOPY) int xp_assign_dev(struct xsk_buff_pool *pool, struct net_device *netdev, u16 queue_id, u16 flags) { bool force_zc, force_copy; struct netdev_bpf bpf; int err = 0; ASSERT_RTNL(); force_zc = flags & XDP_ZEROCOPY; force_copy = flags & XDP_COPY; if (force_zc && force_copy) return -EINVAL; if (xsk_get_pool_from_qid(netdev, queue_id)) return -EBUSY; pool->netdev = netdev; pool->queue_id = queue_id; err = xsk_reg_pool_at_qid(netdev, pool, queue_id); if (err) return err; if (flags & XDP_USE_SG) pool->umem->flags |= XDP_UMEM_SG_FLAG; if (flags & XDP_USE_NEED_WAKEUP) pool->uses_need_wakeup = true; /* Tx needs to be explicitly woken up the first time. Also * for supporting drivers that do not implement this * feature. They will always have to call sendto() or poll(). */ pool->cached_need_wakeup = XDP_WAKEUP_TX; dev_hold(netdev); if (force_copy) /* For copy-mode, we are done. */ return 0; if ((netdev->xdp_features & NETDEV_XDP_ACT_ZC) != NETDEV_XDP_ACT_ZC) { err = -EOPNOTSUPP; goto err_unreg_pool; } if (netdev->xdp_zc_max_segs == 1 && (flags & XDP_USE_SG)) { err = -EOPNOTSUPP; goto err_unreg_pool; } bpf.command = XDP_SETUP_XSK_POOL; bpf.xsk.pool = pool; bpf.xsk.queue_id = queue_id; err = netdev->netdev_ops->ndo_bpf(netdev, &bpf); if (err) goto err_unreg_pool; if (!pool->dma_pages) { WARN(1, "Driver did not DMA map zero-copy buffers"); err = -EINVAL; goto err_unreg_xsk; } pool->umem->zc = true; return 0; err_unreg_xsk: xp_disable_drv_zc(pool); err_unreg_pool: if (!force_zc) err = 0; /* fallback to copy mode */ if (err) { xsk_clear_pool_at_qid(netdev, queue_id); dev_put(netdev); } return err; } int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_sock *umem_xs, struct net_device *dev, u16 queue_id) { u16 flags; struct xdp_umem *umem = umem_xs->umem; /* One fill and completion ring required for each queue id. */ if (!pool->fq || !pool->cq) return -EINVAL; flags = umem->zc ? XDP_ZEROCOPY : XDP_COPY; if (umem_xs->pool->uses_need_wakeup) flags |= XDP_USE_NEED_WAKEUP; return xp_assign_dev(pool, dev, queue_id, flags); } void xp_clear_dev(struct xsk_buff_pool *pool) { if (!pool->netdev) return; xp_disable_drv_zc(pool); xsk_clear_pool_at_qid(pool->netdev, pool->queue_id); dev_put(pool->netdev); pool->netdev = NULL; } static void xp_release_deferred(struct work_struct *work) { struct xsk_buff_pool *pool = container_of(work, struct xsk_buff_pool, work); rtnl_lock(); xp_clear_dev(pool); rtnl_unlock(); if (pool->fq) { xskq_destroy(pool->fq); pool->fq = NULL; } if (pool->cq) { xskq_destroy(pool->cq); pool->cq = NULL; } xdp_put_umem(pool->umem, false); xp_destroy(pool); } void xp_get_pool(struct xsk_buff_pool *pool) { refcount_inc(&pool->users); } bool xp_put_pool(struct xsk_buff_pool *pool) { if (!pool) return false; if (refcount_dec_and_test(&pool->users)) { INIT_WORK(&pool->work, xp_release_deferred); schedule_work(&pool->work); return true; } return false; } static struct xsk_dma_map *xp_find_dma_map(struct xsk_buff_pool *pool) { struct xsk_dma_map *dma_map; list_for_each_entry(dma_map, &pool->umem->xsk_dma_list, list) { if (dma_map->netdev == pool->netdev) return dma_map; } return NULL; } static struct xsk_dma_map *xp_create_dma_map(struct device *dev, struct net_device *netdev, u32 nr_pages, struct xdp_umem *umem) { struct xsk_dma_map *dma_map; dma_map = kzalloc(sizeof(*dma_map), GFP_KERNEL); if (!dma_map) return NULL; dma_map->dma_pages = kvcalloc(nr_pages, sizeof(*dma_map->dma_pages), GFP_KERNEL); if (!dma_map->dma_pages) { kfree(dma_map); return NULL; } dma_map->netdev = netdev; dma_map->dev = dev; dma_map->dma_need_sync = false; dma_map->dma_pages_cnt = nr_pages; refcount_set(&dma_map->users, 1); list_add(&dma_map->list, &umem->xsk_dma_list); return dma_map; } static void xp_destroy_dma_map(struct xsk_dma_map *dma_map) { list_del(&dma_map->list); kvfree(dma_map->dma_pages); kfree(dma_map); } static void __xp_dma_unmap(struct xsk_dma_map *dma_map, unsigned long attrs) { dma_addr_t *dma; u32 i; for (i = 0; i < dma_map->dma_pages_cnt; i++) { dma = &dma_map->dma_pages[i]; if (*dma) { *dma &= ~XSK_NEXT_PG_CONTIG_MASK; dma_unmap_page_attrs(dma_map->dev, *dma, PAGE_SIZE, DMA_BIDIRECTIONAL, attrs); *dma = 0; } } xp_destroy_dma_map(dma_map); } void xp_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs) { struct xsk_dma_map *dma_map; if (!pool->dma_pages) return; dma_map = xp_find_dma_map(pool); if (!dma_map) { WARN(1, "Could not find dma_map for device"); return; } if (!refcount_dec_and_test(&dma_map->users)) return; __xp_dma_unmap(dma_map, attrs); kvfree(pool->dma_pages); pool->dma_pages = NULL; pool->dma_pages_cnt = 0; pool->dev = NULL; } EXPORT_SYMBOL(xp_dma_unmap); static void xp_check_dma_contiguity(struct xsk_dma_map *dma_map) { u32 i; for (i = 0; i < dma_map->dma_pages_cnt - 1; i++) { if (dma_map->dma_pages[i] + PAGE_SIZE == dma_map->dma_pages[i + 1]) dma_map->dma_pages[i] |= XSK_NEXT_PG_CONTIG_MASK; else dma_map->dma_pages[i] &= ~XSK_NEXT_PG_CONTIG_MASK; } } static int xp_init_dma_info(struct xsk_buff_pool *pool, struct xsk_dma_map *dma_map) { if (!pool->unaligned) { u32 i; for (i = 0; i < pool->heads_cnt; i++) { struct xdp_buff_xsk *xskb = &pool->heads[i]; xp_init_xskb_dma(xskb, pool, dma_map->dma_pages, xskb->orig_addr); } } pool->dma_pages = kvcalloc(dma_map->dma_pages_cnt, sizeof(*pool->dma_pages), GFP_KERNEL); if (!pool->dma_pages) return -ENOMEM; pool->dev = dma_map->dev; pool->dma_pages_cnt = dma_map->dma_pages_cnt; pool->dma_need_sync = dma_map->dma_need_sync; memcpy(pool->dma_pages, dma_map->dma_pages, pool->dma_pages_cnt * sizeof(*pool->dma_pages)); return 0; } int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev, unsigned long attrs, struct page **pages, u32 nr_pages) { struct xsk_dma_map *dma_map; dma_addr_t dma; int err; u32 i; dma_map = xp_find_dma_map(pool); if (dma_map) { err = xp_init_dma_info(pool, dma_map); if (err) return err; refcount_inc(&dma_map->users); return 0; } dma_map = xp_create_dma_map(dev, pool->netdev, nr_pages, pool->umem); if (!dma_map) return -ENOMEM; for (i = 0; i < dma_map->dma_pages_cnt; i++) { dma = dma_map_page_attrs(dev, pages[i], 0, PAGE_SIZE, DMA_BIDIRECTIONAL, attrs); if (dma_mapping_error(dev, dma)) { __xp_dma_unmap(dma_map, attrs); return -ENOMEM; } if (dma_need_sync(dev, dma)) dma_map->dma_need_sync = true; dma_map->dma_pages[i] = dma; } if (pool->unaligned) xp_check_dma_contiguity(dma_map); err = xp_init_dma_info(pool, dma_map); if (err) { __xp_dma_unmap(dma_map, attrs); return err; } return 0; } EXPORT_SYMBOL(xp_dma_map); static bool xp_addr_crosses_non_contig_pg(struct xsk_buff_pool *pool, u64 addr) { return xp_desc_crosses_non_contig_pg(pool, addr, pool->chunk_size); } static bool xp_check_unaligned(struct xsk_buff_pool *pool, u64 *addr) { *addr = xp_unaligned_extract_addr(*addr); if (*addr >= pool->addrs_cnt || *addr + pool->chunk_size > pool->addrs_cnt || xp_addr_crosses_non_contig_pg(pool, *addr)) return false; return true; } static bool xp_check_aligned(struct xsk_buff_pool *pool, u64 *addr) { *addr = xp_aligned_extract_addr(pool, *addr); return *addr < pool->addrs_cnt; } static struct xdp_buff_xsk *__xp_alloc(struct xsk_buff_pool *pool) { struct xdp_buff_xsk *xskb; u64 addr; bool ok; if (pool->free_heads_cnt == 0) return NULL; for (;;) { if (!xskq_cons_peek_addr_unchecked(pool->fq, &addr)) { pool->fq->queue_empty_descs++; return NULL; } ok = pool->unaligned ? xp_check_unaligned(pool, &addr) : xp_check_aligned(pool, &addr); if (!ok) { pool->fq->invalid_descs++; xskq_cons_release(pool->fq); continue; } break; } if (pool->unaligned) { xskb = pool->free_heads[--pool->free_heads_cnt]; xp_init_xskb_addr(xskb, pool, addr); if (pool->dma_pages) xp_init_xskb_dma(xskb, pool, pool->dma_pages, addr); } else { xskb = &pool->heads[xp_aligned_extract_idx(pool, addr)]; } xskq_cons_release(pool->fq); return xskb; } struct xdp_buff *xp_alloc(struct xsk_buff_pool *pool) { struct xdp_buff_xsk *xskb; if (!pool->free_list_cnt) { xskb = __xp_alloc(pool); if (!xskb) return NULL; } else { pool->free_list_cnt--; xskb = list_first_entry(&pool->free_list, struct xdp_buff_xsk, free_list_node); list_del_init(&xskb->free_list_node); } xskb->xdp.data = xskb->xdp.data_hard_start + XDP_PACKET_HEADROOM; xskb->xdp.data_meta = xskb->xdp.data; if (pool->dma_need_sync) { dma_sync_single_range_for_device(pool->dev, xskb->dma, 0, pool->frame_len, DMA_BIDIRECTIONAL); } return &xskb->xdp; } EXPORT_SYMBOL(xp_alloc); static u32 xp_alloc_new_from_fq(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 max) { u32 i, cached_cons, nb_entries; if (max > pool->free_heads_cnt) max = pool->free_heads_cnt; max = xskq_cons_nb_entries(pool->fq, max); cached_cons = pool->fq->cached_cons; nb_entries = max; i = max; while (i--) { struct xdp_buff_xsk *xskb; u64 addr; bool ok; __xskq_cons_read_addr_unchecked(pool->fq, cached_cons++, &addr); ok = pool->unaligned ? xp_check_unaligned(pool, &addr) : xp_check_aligned(pool, &addr); if (unlikely(!ok)) { pool->fq->invalid_descs++; nb_entries--; continue; } if (pool->unaligned) { xskb = pool->free_heads[--pool->free_heads_cnt]; xp_init_xskb_addr(xskb, pool, addr); if (pool->dma_pages) xp_init_xskb_dma(xskb, pool, pool->dma_pages, addr); } else { xskb = &pool->heads[xp_aligned_extract_idx(pool, addr)]; } *xdp = &xskb->xdp; xdp++; } xskq_cons_release_n(pool->fq, max); return nb_entries; } static u32 xp_alloc_reused(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 nb_entries) { struct xdp_buff_xsk *xskb; u32 i; nb_entries = min_t(u32, nb_entries, pool->free_list_cnt); i = nb_entries; while (i--) { xskb = list_first_entry(&pool->free_list, struct xdp_buff_xsk, free_list_node); list_del_init(&xskb->free_list_node); *xdp = &xskb->xdp; xdp++; } pool->free_list_cnt -= nb_entries; return nb_entries; } u32 xp_alloc_batch(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 max) { u32 nb_entries1 = 0, nb_entries2; if (unlikely(pool->dma_need_sync)) { struct xdp_buff *buff; /* Slow path */ buff = xp_alloc(pool); if (buff) *xdp = buff; return !!buff; } if (unlikely(pool->free_list_cnt)) { nb_entries1 = xp_alloc_reused(pool, xdp, max); if (nb_entries1 == max) return nb_entries1; max -= nb_entries1; xdp += nb_entries1; } nb_entries2 = xp_alloc_new_from_fq(pool, xdp, max); if (!nb_entries2) pool->fq->queue_empty_descs++; return nb_entries1 + nb_entries2; } EXPORT_SYMBOL(xp_alloc_batch); bool xp_can_alloc(struct xsk_buff_pool *pool, u32 count) { if (pool->free_list_cnt >= count) return true; return xskq_cons_has_entries(pool->fq, count - pool->free_list_cnt); } EXPORT_SYMBOL(xp_can_alloc); void xp_free(struct xdp_buff_xsk *xskb) { if (!list_empty(&xskb->free_list_node)) return; xskb->pool->free_list_cnt++; list_add(&xskb->free_list_node, &xskb->pool->free_list); } EXPORT_SYMBOL(xp_free); void *xp_raw_get_data(struct xsk_buff_pool *pool, u64 addr) { addr = pool->unaligned ? xp_unaligned_add_offset_to_addr(addr) : addr; return pool->addrs + addr; } EXPORT_SYMBOL(xp_raw_get_data); dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr) { addr = pool->unaligned ? xp_unaligned_add_offset_to_addr(addr) : addr; return (pool->dma_pages[addr >> PAGE_SHIFT] & ~XSK_NEXT_PG_CONTIG_MASK) + (addr & ~PAGE_MASK); } EXPORT_SYMBOL(xp_raw_get_dma); void xp_dma_sync_for_cpu_slow(struct xdp_buff_xsk *xskb) { dma_sync_single_range_for_cpu(xskb->pool->dev, xskb->dma, 0, xskb->pool->frame_len, DMA_BIDIRECTIONAL); } EXPORT_SYMBOL(xp_dma_sync_for_cpu_slow); void xp_dma_sync_for_device_slow(struct xsk_buff_pool *pool, dma_addr_t dma, size_t size) { dma_sync_single_range_for_device(pool->dev, dma, 0, size, DMA_BIDIRECTIONAL); } EXPORT_SYMBOL(xp_dma_sync_for_device_slow);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_PKT_SCHED_H #define __NET_PKT_SCHED_H #include <linux/jiffies.h> #include <linux/ktime.h> #include <linux/if_vlan.h> #include <linux/netdevice.h> #include <net/sch_generic.h> #include <net/net_namespace.h> #include <uapi/linux/pkt_sched.h> #define DEFAULT_TX_QUEUE_LEN 1000 #define STAB_SIZE_LOG_MAX 30 struct qdisc_walker { int stop; int skip; int count; int (*fn)(struct Qdisc *, unsigned long cl, struct qdisc_walker *); }; #define qdisc_priv(q) \ _Generic(q, \ const struct Qdisc * : (const void *)&q->privdata, \ struct Qdisc * : (void *)&q->privdata) static inline struct Qdisc *qdisc_from_priv(void *priv) { return container_of(priv, struct Qdisc, privdata); } /* Timer resolution MUST BE < 10% of min_schedulable_packet_size/bandwidth Normal IP packet size ~ 512byte, hence: 0.5Kbyte/1Mbyte/sec = 0.5msec, so that we need 50usec timer for 10Mbit ethernet. 10msec resolution -> <50Kbit/sec. The result: [34]86 is not good choice for QoS router :-( The things are not so bad, because we may use artificial clock evaluated by integration of network data flow in the most critical places. */ typedef u64 psched_time_t; typedef long psched_tdiff_t; /* Avoid doing 64 bit divide */ #define PSCHED_SHIFT 6 #define PSCHED_TICKS2NS(x) ((s64)(x) << PSCHED_SHIFT) #define PSCHED_NS2TICKS(x) ((x) >> PSCHED_SHIFT) #define PSCHED_TICKS_PER_SEC PSCHED_NS2TICKS(NSEC_PER_SEC) #define PSCHED_PASTPERFECT 0 static inline psched_time_t psched_get_time(void) { return PSCHED_NS2TICKS(ktime_get_ns()); } struct qdisc_watchdog { struct hrtimer timer; struct Qdisc *qdisc; }; void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc, clockid_t clockid); void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc); void qdisc_watchdog_schedule_range_ns(struct qdisc_watchdog *wd, u64 expires, u64 delta_ns); static inline void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires) { return qdisc_watchdog_schedule_range_ns(wd, expires, 0ULL); } static inline void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires) { qdisc_watchdog_schedule_ns(wd, PSCHED_TICKS2NS(expires)); } void qdisc_watchdog_cancel(struct qdisc_watchdog *wd); extern struct Qdisc_ops pfifo_qdisc_ops; extern struct Qdisc_ops bfifo_qdisc_ops; extern struct Qdisc_ops pfifo_head_drop_qdisc_ops; int fifo_set_limit(struct Qdisc *q, unsigned int limit); struct Qdisc *fifo_create_dflt(struct Qdisc *sch, struct Qdisc_ops *ops, unsigned int limit, struct netlink_ext_ack *extack); int register_qdisc(struct Qdisc_ops *qops); void unregister_qdisc(struct Qdisc_ops *qops); void qdisc_get_default(char *id, size_t len); int qdisc_set_default(const char *id); void qdisc_hash_add(struct Qdisc *q, bool invisible); void qdisc_hash_del(struct Qdisc *q); struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle); struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle); struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab, struct netlink_ext_ack *extack); void qdisc_put_rtab(struct qdisc_rate_table *tab); void qdisc_put_stab(struct qdisc_size_table *tab); void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc); bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, struct net_device *dev, struct netdev_queue *txq, spinlock_t *root_lock, bool validate); void __qdisc_run(struct Qdisc *q); static inline void qdisc_run(struct Qdisc *q) { if (qdisc_run_begin(q)) { __qdisc_run(q); qdisc_run_end(q); } } extern const struct nla_policy rtm_tca_policy[TCA_MAX + 1]; /* Calculate maximal size of packet seen by hard_start_xmit routine of this device. */ static inline unsigned int psched_mtu(const struct net_device *dev) { return READ_ONCE(dev->mtu) + dev->hard_header_len; } static inline struct net *qdisc_net(struct Qdisc *q) { return dev_net(q->dev_queue->dev); } struct tc_query_caps_base { enum tc_setup_type type; void *caps; }; struct tc_cbs_qopt_offload { u8 enable; s32 queue; s32 hicredit; s32 locredit; s32 idleslope; s32 sendslope; }; struct tc_etf_qopt_offload { u8 enable; s32 queue; }; struct tc_mqprio_caps { bool validate_queue_counts:1; }; struct tc_mqprio_qopt_offload { /* struct tc_mqprio_qopt must always be the first element */ struct tc_mqprio_qopt qopt; struct netlink_ext_ack *extack; u16 mode; u16 shaper; u32 flags; u64 min_rate[TC_QOPT_MAX_QUEUE]; u64 max_rate[TC_QOPT_MAX_QUEUE]; unsigned long preemptible_tcs; }; struct tc_taprio_caps { bool supports_queue_max_sdu:1; bool gate_mask_per_txq:1; /* Device expects lower TXQ numbers to have higher priority over higher * TXQs, regardless of their TC mapping. DO NOT USE FOR NEW DRIVERS, * INSTEAD ENFORCE A PROPER TC:TXQ MAPPING COMING FROM USER SPACE. */ bool broken_mqprio:1; }; enum tc_taprio_qopt_cmd { TAPRIO_CMD_REPLACE, TAPRIO_CMD_DESTROY, TAPRIO_CMD_STATS, TAPRIO_CMD_QUEUE_STATS, }; /** * struct tc_taprio_qopt_stats - IEEE 802.1Qbv statistics * @window_drops: Frames that were dropped because they were too large to be * transmitted in any of the allotted time windows (open gates) for their * traffic class. * @tx_overruns: Frames still being transmitted by the MAC after the * transmission gate associated with their traffic class has closed. * Equivalent to `12.29.1.1.2 TransmissionOverrun` from 802.1Q-2018. */ struct tc_taprio_qopt_stats { u64 window_drops; u64 tx_overruns; }; struct tc_taprio_qopt_queue_stats { int queue; struct tc_taprio_qopt_stats stats; }; struct tc_taprio_sched_entry { u8 command; /* TC_TAPRIO_CMD_* */ /* The gate_mask in the offloading side refers to traffic classes */ u32 gate_mask; u32 interval; }; struct tc_taprio_qopt_offload { enum tc_taprio_qopt_cmd cmd; union { /* TAPRIO_CMD_STATS */ struct tc_taprio_qopt_stats stats; /* TAPRIO_CMD_QUEUE_STATS */ struct tc_taprio_qopt_queue_stats queue_stats; /* TAPRIO_CMD_REPLACE */ struct { struct tc_mqprio_qopt_offload mqprio; struct netlink_ext_ack *extack; ktime_t base_time; u64 cycle_time; u64 cycle_time_extension; u32 max_sdu[TC_MAX_QUEUE]; size_t num_entries; struct tc_taprio_sched_entry entries[]; }; }; }; #if IS_ENABLED(CONFIG_NET_SCH_TAPRIO) /* Reference counting */ struct tc_taprio_qopt_offload *taprio_offload_get(struct tc_taprio_qopt_offload *offload); void taprio_offload_free(struct tc_taprio_qopt_offload *offload); #else /* Reference counting */ static inline struct tc_taprio_qopt_offload * taprio_offload_get(struct tc_taprio_qopt_offload *offload) { return NULL; } static inline void taprio_offload_free(struct tc_taprio_qopt_offload *offload) { } #endif /* Ensure skb_mstamp_ns, which might have been populated with the txtime, is * not mistaken for a software timestamp, because this will otherwise prevent * the dispatch of hardware timestamps to the socket. */ static inline void skb_txtime_consumed(struct sk_buff *skb) { skb->tstamp = ktime_set(0, 0); } static inline bool tc_qdisc_stats_dump(struct Qdisc *sch, unsigned long cl, struct qdisc_walker *arg) { if (arg->count >= arg->skip && arg->fn(sch, cl, arg) < 0) { arg->stop = 1; return false; } arg->count++; return true; } #endif
1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 // SPDX-License-Identifier: GPL-2.0-only /* * net/sched/sch_ets.c Enhanced Transmission Selection scheduler * * Description * ----------- * * The Enhanced Transmission Selection scheduler is a classful queuing * discipline that merges functionality of PRIO and DRR qdiscs in one scheduler. * ETS makes it easy to configure a set of strict and bandwidth-sharing bands to * implement the transmission selection described in 802.1Qaz. * * Although ETS is technically classful, it's not possible to add and remove * classes at will. Instead one specifies number of classes, how many are * PRIO-like and how many DRR-like, and quanta for the latter. * * Algorithm * --------- * * The strict classes, if any, are tried for traffic first: first band 0, if it * has no traffic then band 1, etc. * * When there is no traffic in any of the strict queues, the bandwidth-sharing * ones are tried next. Each band is assigned a deficit counter, initialized to * "quantum" of that band. ETS maintains a list of active bandwidth-sharing * bands whose qdiscs are non-empty. A packet is dequeued from the band at the * head of the list if the packet size is smaller or equal to the deficit * counter. If the counter is too small, it is increased by "quantum" and the * scheduler moves on to the next band in the active list. */ #include <linux/module.h> #include <net/gen_stats.h> #include <net/netlink.h> #include <net/pkt_cls.h> #include <net/pkt_sched.h> #include <net/sch_generic.h> struct ets_class { struct list_head alist; /* In struct ets_sched.active. */ struct Qdisc *qdisc; u32 quantum; u32 deficit; struct gnet_stats_basic_sync bstats; struct gnet_stats_queue qstats; }; struct ets_sched { struct list_head active; struct tcf_proto __rcu *filter_list; struct tcf_block *block; unsigned int nbands; unsigned int nstrict; u8 prio2band[TC_PRIO_MAX + 1]; struct ets_class classes[TCQ_ETS_MAX_BANDS]; }; static const struct nla_policy ets_policy[TCA_ETS_MAX + 1] = { [TCA_ETS_NBANDS] = { .type = NLA_U8 }, [TCA_ETS_NSTRICT] = { .type = NLA_U8 }, [TCA_ETS_QUANTA] = { .type = NLA_NESTED }, [TCA_ETS_PRIOMAP] = { .type = NLA_NESTED }, }; static const struct nla_policy ets_priomap_policy[TCA_ETS_MAX + 1] = { [TCA_ETS_PRIOMAP_BAND] = { .type = NLA_U8 }, }; static const struct nla_policy ets_quanta_policy[TCA_ETS_MAX + 1] = { [TCA_ETS_QUANTA_BAND] = { .type = NLA_U32 }, }; static const struct nla_policy ets_class_policy[TCA_ETS_MAX + 1] = { [TCA_ETS_QUANTA_BAND] = { .type = NLA_U32 }, }; static int ets_quantum_parse(struct Qdisc *sch, const struct nlattr *attr, unsigned int *quantum, struct netlink_ext_ack *extack) { *quantum = nla_get_u32(attr); if (!*quantum) { NL_SET_ERR_MSG(extack, "ETS quantum cannot be zero"); return -EINVAL; } return 0; } static struct ets_class * ets_class_from_arg(struct Qdisc *sch, unsigned long arg) { struct ets_sched *q = qdisc_priv(sch); return &q->classes[arg - 1]; } static u32 ets_class_id(struct Qdisc *sch, const struct ets_class *cl) { struct ets_sched *q = qdisc_priv(sch); int band = cl - q->classes; return TC_H_MAKE(sch->handle, band + 1); } static void ets_offload_change(struct Qdisc *sch) { struct net_device *dev = qdisc_dev(sch); struct ets_sched *q = qdisc_priv(sch); struct tc_ets_qopt_offload qopt; unsigned int w_psum_prev = 0; unsigned int q_psum = 0; unsigned int q_sum = 0; unsigned int quantum; unsigned int w_psum; unsigned int weight; unsigned int i; if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) return; qopt.command = TC_ETS_REPLACE; qopt.handle = sch->handle; qopt.parent = sch->parent; qopt.replace_params.bands = q->nbands; qopt.replace_params.qstats = &sch->qstats; memcpy(&qopt.replace_params.priomap, q->prio2band, sizeof(q->prio2band)); for (i = 0; i < q->nbands; i++) q_sum += q->classes[i].quantum; for (i = 0; i < q->nbands; i++) { quantum = q->classes[i].quantum; q_psum += quantum; w_psum = quantum ? q_psum * 100 / q_sum : 0; weight = w_psum - w_psum_prev; w_psum_prev = w_psum; qopt.replace_params.quanta[i] = quantum; qopt.replace_params.weights[i] = weight; } dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_ETS, &qopt); } static void ets_offload_destroy(struct Qdisc *sch) { struct net_device *dev = qdisc_dev(sch); struct tc_ets_qopt_offload qopt; if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) return; qopt.command = TC_ETS_DESTROY; qopt.handle = sch->handle; qopt.parent = sch->parent; dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_ETS, &qopt); } static void ets_offload_graft(struct Qdisc *sch, struct Qdisc *new, struct Qdisc *old, unsigned long arg, struct netlink_ext_ack *extack) { struct net_device *dev = qdisc_dev(sch); struct tc_ets_qopt_offload qopt; qopt.command = TC_ETS_GRAFT; qopt.handle = sch->handle; qopt.parent = sch->parent; qopt.graft_params.band = arg - 1; qopt.graft_params.child_handle = new->handle; qdisc_offload_graft_helper(dev, sch, new, old, TC_SETUP_QDISC_ETS, &qopt, extack); } static int ets_offload_dump(struct Qdisc *sch) { struct tc_ets_qopt_offload qopt; qopt.command = TC_ETS_STATS; qopt.handle = sch->handle; qopt.parent = sch->parent; qopt.stats.bstats = &sch->bstats; qopt.stats.qstats = &sch->qstats; return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_ETS, &qopt); } static bool ets_class_is_strict(struct ets_sched *q, const struct ets_class *cl) { unsigned int band = cl - q->classes; return band < q->nstrict; } static int ets_class_change(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca, unsigned long *arg, struct netlink_ext_ack *extack) { struct ets_class *cl = ets_class_from_arg(sch, *arg); struct ets_sched *q = qdisc_priv(sch); struct nlattr *opt = tca[TCA_OPTIONS]; struct nlattr *tb[TCA_ETS_MAX + 1]; unsigned int quantum; int err; /* Classes can be added and removed only through Qdisc_ops.change * interface. */ if (!cl) { NL_SET_ERR_MSG(extack, "Fine-grained class addition and removal is not supported"); return -EOPNOTSUPP; } if (!opt) { NL_SET_ERR_MSG(extack, "ETS options are required for this operation"); return -EINVAL; } err = nla_parse_nested(tb, TCA_ETS_MAX, opt, ets_class_policy, extack); if (err < 0) return err; if (!tb[TCA_ETS_QUANTA_BAND]) /* Nothing to configure. */ return 0; if (ets_class_is_strict(q, cl)) { NL_SET_ERR_MSG(extack, "Strict bands do not have a configurable quantum"); return -EINVAL; } err = ets_quantum_parse(sch, tb[TCA_ETS_QUANTA_BAND], &quantum, extack); if (err) return err; sch_tree_lock(sch); cl->quantum = quantum; sch_tree_unlock(sch); ets_offload_change(sch); return 0; } static int ets_class_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, struct Qdisc **old, struct netlink_ext_ack *extack) { struct ets_class *cl = ets_class_from_arg(sch, arg); if (!new) { new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, ets_class_id(sch, cl), NULL); if (!new) new = &noop_qdisc; else qdisc_hash_add(new, true); } *old = qdisc_replace(sch, new, &cl->qdisc); ets_offload_graft(sch, new, *old, arg, extack); return 0; } static struct Qdisc *ets_class_leaf(struct Qdisc *sch, unsigned long arg) { struct ets_class *cl = ets_class_from_arg(sch, arg); return cl->qdisc; } static unsigned long ets_class_find(struct Qdisc *sch, u32 classid) { unsigned long band = TC_H_MIN(classid); struct ets_sched *q = qdisc_priv(sch); if (band - 1 >= q->nbands) return 0; return band; } static void ets_class_qlen_notify(struct Qdisc *sch, unsigned long arg) { struct ets_class *cl = ets_class_from_arg(sch, arg); struct ets_sched *q = qdisc_priv(sch); /* We get notified about zero-length child Qdiscs as well if they are * offloaded. Those aren't on the active list though, so don't attempt * to remove them. */ if (!ets_class_is_strict(q, cl) && sch->q.qlen) list_del(&cl->alist); } static int ets_class_dump(struct Qdisc *sch, unsigned long arg, struct sk_buff *skb, struct tcmsg *tcm) { struct ets_class *cl = ets_class_from_arg(sch, arg); struct ets_sched *q = qdisc_priv(sch); struct nlattr *nest; tcm->tcm_parent = TC_H_ROOT; tcm->tcm_handle = ets_class_id(sch, cl); tcm->tcm_info = cl->qdisc->handle; nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (!nest) goto nla_put_failure; if (!ets_class_is_strict(q, cl)) { if (nla_put_u32(skb, TCA_ETS_QUANTA_BAND, cl->quantum)) goto nla_put_failure; } return nla_nest_end(skb, nest); nla_put_failure: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static int ets_class_dump_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d) { struct ets_class *cl = ets_class_from_arg(sch, arg); struct Qdisc *cl_q = cl->qdisc; if (gnet_stats_copy_basic(d, NULL, &cl_q->bstats, true) < 0 || qdisc_qstats_copy(d, cl_q) < 0) return -1; return 0; } static void ets_qdisc_walk(struct Qdisc *sch, struct qdisc_walker *arg) { struct ets_sched *q = qdisc_priv(sch); int i; if (arg->stop) return; for (i = 0; i < q->nbands; i++) { if (!tc_qdisc_stats_dump(sch, i + 1, arg)) break; } } static struct tcf_block * ets_qdisc_tcf_block(struct Qdisc *sch, unsigned long cl, struct netlink_ext_ack *extack) { struct ets_sched *q = qdisc_priv(sch); if (cl) { NL_SET_ERR_MSG(extack, "ETS classid must be zero"); return NULL; } return q->block; } static unsigned long ets_qdisc_bind_tcf(struct Qdisc *sch, unsigned long parent, u32 classid) { return ets_class_find(sch, classid); } static void ets_qdisc_unbind_tcf(struct Qdisc *sch, unsigned long arg) { } static struct ets_class *ets_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) { struct ets_sched *q = qdisc_priv(sch); u32 band = skb->priority; struct tcf_result res; struct tcf_proto *fl; int err; *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; if (TC_H_MAJ(skb->priority) != sch->handle) { fl = rcu_dereference_bh(q->filter_list); err = tcf_classify(skb, NULL, fl, &res, false); #ifdef CONFIG_NET_CLS_ACT switch (err) { case TC_ACT_STOLEN: case TC_ACT_QUEUED: case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; fallthrough; case TC_ACT_SHOT: return NULL; } #endif if (!fl || err < 0) { if (TC_H_MAJ(band)) band = 0; return &q->classes[q->prio2band[band & TC_PRIO_MAX]]; } band = res.classid; } band = TC_H_MIN(band) - 1; if (band >= q->nbands) return &q->classes[q->prio2band[0]]; return &q->classes[band]; } static int ets_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { unsigned int len = qdisc_pkt_len(skb); struct ets_sched *q = qdisc_priv(sch); struct ets_class *cl; int err = 0; bool first; cl = ets_classify(skb, sch, &err); if (!cl) { if (err & __NET_XMIT_BYPASS) qdisc_qstats_drop(sch); __qdisc_drop(skb, to_free); return err; } first = !cl->qdisc->q.qlen; err = qdisc_enqueue(skb, cl->qdisc, to_free); if (unlikely(err != NET_XMIT_SUCCESS)) { if (net_xmit_drop_count(err)) { cl->qstats.drops++; qdisc_qstats_drop(sch); } return err; } if (first && !ets_class_is_strict(q, cl)) { list_add_tail(&cl->alist, &q->active); cl->deficit = cl->quantum; } sch->qstats.backlog += len; sch->q.qlen++; return err; } static struct sk_buff * ets_qdisc_dequeue_skb(struct Qdisc *sch, struct sk_buff *skb) { qdisc_bstats_update(sch, skb); qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; return skb; } static struct sk_buff *ets_qdisc_dequeue(struct Qdisc *sch) { struct ets_sched *q = qdisc_priv(sch); struct ets_class *cl; struct sk_buff *skb; unsigned int band; unsigned int len; while (1) { for (band = 0; band < q->nstrict; band++) { cl = &q->classes[band]; skb = qdisc_dequeue_peeked(cl->qdisc); if (skb) return ets_qdisc_dequeue_skb(sch, skb); } if (list_empty(&q->active)) goto out; cl = list_first_entry(&q->active, struct ets_class, alist); skb = cl->qdisc->ops->peek(cl->qdisc); if (!skb) { qdisc_warn_nonwc(__func__, cl->qdisc); goto out; } len = qdisc_pkt_len(skb); if (len <= cl->deficit) { cl->deficit -= len; skb = qdisc_dequeue_peeked(cl->qdisc); if (unlikely(!skb)) goto out; if (cl->qdisc->q.qlen == 0) list_del(&cl->alist); return ets_qdisc_dequeue_skb(sch, skb); } cl->deficit += cl->quantum; list_move_tail(&cl->alist, &q->active); } out: return NULL; } static int ets_qdisc_priomap_parse(struct nlattr *priomap_attr, unsigned int nbands, u8 *priomap, struct netlink_ext_ack *extack) { const struct nlattr *attr; int prio = 0; u8 band; int rem; int err; err = __nla_validate_nested(priomap_attr, TCA_ETS_MAX, ets_priomap_policy, NL_VALIDATE_STRICT, extack); if (err) return err; nla_for_each_nested(attr, priomap_attr, rem) { switch (nla_type(attr)) { case TCA_ETS_PRIOMAP_BAND: if (prio > TC_PRIO_MAX) { NL_SET_ERR_MSG_MOD(extack, "Too many priorities in ETS priomap"); return -EINVAL; } band = nla_get_u8(attr); if (band >= nbands) { NL_SET_ERR_MSG_MOD(extack, "Invalid band number in ETS priomap"); return -EINVAL; } priomap[prio++] = band; break; default: WARN_ON_ONCE(1); /* Validate should have caught this. */ return -EINVAL; } } return 0; } static int ets_qdisc_quanta_parse(struct Qdisc *sch, struct nlattr *quanta_attr, unsigned int nbands, unsigned int nstrict, unsigned int *quanta, struct netlink_ext_ack *extack) { const struct nlattr *attr; int band = nstrict; int rem; int err; err = __nla_validate_nested(quanta_attr, TCA_ETS_MAX, ets_quanta_policy, NL_VALIDATE_STRICT, extack); if (err < 0) return err; nla_for_each_nested(attr, quanta_attr, rem) { switch (nla_type(attr)) { case TCA_ETS_QUANTA_BAND: if (band >= nbands) { NL_SET_ERR_MSG_MOD(extack, "ETS quanta has more values than bands"); return -EINVAL; } err = ets_quantum_parse(sch, attr, &quanta[band++], extack); if (err) return err; break; default: WARN_ON_ONCE(1); /* Validate should have caught this. */ return -EINVAL; } } return 0; } static int ets_qdisc_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { unsigned int quanta[TCQ_ETS_MAX_BANDS] = {0}; struct Qdisc *queues[TCQ_ETS_MAX_BANDS]; struct ets_sched *q = qdisc_priv(sch); struct nlattr *tb[TCA_ETS_MAX + 1]; unsigned int oldbands = q->nbands; u8 priomap[TC_PRIO_MAX + 1]; unsigned int nstrict = 0; unsigned int nbands; unsigned int i; int err; err = nla_parse_nested(tb, TCA_ETS_MAX, opt, ets_policy, extack); if (err < 0) return err; if (!tb[TCA_ETS_NBANDS]) { NL_SET_ERR_MSG_MOD(extack, "Number of bands is a required argument"); return -EINVAL; } nbands = nla_get_u8(tb[TCA_ETS_NBANDS]); if (nbands < 1 || nbands > TCQ_ETS_MAX_BANDS) { NL_SET_ERR_MSG_MOD(extack, "Invalid number of bands"); return -EINVAL; } /* Unless overridden, traffic goes to the last band. */ memset(priomap, nbands - 1, sizeof(priomap)); if (tb[TCA_ETS_NSTRICT]) { nstrict = nla_get_u8(tb[TCA_ETS_NSTRICT]); if (nstrict > nbands) { NL_SET_ERR_MSG_MOD(extack, "Invalid number of strict bands"); return -EINVAL; } } if (tb[TCA_ETS_PRIOMAP]) { err = ets_qdisc_priomap_parse(tb[TCA_ETS_PRIOMAP], nbands, priomap, extack); if (err) return err; } if (tb[TCA_ETS_QUANTA]) { err = ets_qdisc_quanta_parse(sch, tb[TCA_ETS_QUANTA], nbands, nstrict, quanta, extack); if (err) return err; } /* If there are more bands than strict + quanta provided, the remaining * ones are ETS with quantum of MTU. Initialize the missing values here. */ for (i = nstrict; i < nbands; i++) { if (!quanta[i]) quanta[i] = psched_mtu(qdisc_dev(sch)); } /* Before commit, make sure we can allocate all new qdiscs */ for (i = oldbands; i < nbands; i++) { queues[i] = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, ets_class_id(sch, &q->classes[i]), extack); if (!queues[i]) { while (i > oldbands) qdisc_put(queues[--i]); return -ENOMEM; } } sch_tree_lock(sch); q->nbands = nbands; for (i = nstrict; i < q->nstrict; i++) { if (q->classes[i].qdisc->q.qlen) { list_add_tail(&q->classes[i].alist, &q->active); q->classes[i].deficit = quanta[i]; } } for (i = q->nbands; i < oldbands; i++) { if (i >= q->nstrict && q->classes[i].qdisc->q.qlen) list_del(&q->classes[i].alist); qdisc_tree_flush_backlog(q->classes[i].qdisc); } q->nstrict = nstrict; memcpy(q->prio2band, priomap, sizeof(priomap)); for (i = 0; i < q->nbands; i++) q->classes[i].quantum = quanta[i]; for (i = oldbands; i < q->nbands; i++) { q->classes[i].qdisc = queues[i]; if (q->classes[i].qdisc != &noop_qdisc) qdisc_hash_add(q->classes[i].qdisc, true); } sch_tree_unlock(sch); ets_offload_change(sch); for (i = q->nbands; i < oldbands; i++) { qdisc_put(q->classes[i].qdisc); q->classes[i].qdisc = NULL; q->classes[i].quantum = 0; q->classes[i].deficit = 0; gnet_stats_basic_sync_init(&q->classes[i].bstats); memset(&q->classes[i].qstats, 0, sizeof(q->classes[i].qstats)); } return 0; } static int ets_qdisc_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct ets_sched *q = qdisc_priv(sch); int err, i; if (!opt) return -EINVAL; err = tcf_block_get(&q->block, &q->filter_list, sch, extack); if (err) return err; INIT_LIST_HEAD(&q->active); for (i = 0; i < TCQ_ETS_MAX_BANDS; i++) INIT_LIST_HEAD(&q->classes[i].alist); return ets_qdisc_change(sch, opt, extack); } static void ets_qdisc_reset(struct Qdisc *sch) { struct ets_sched *q = qdisc_priv(sch); int band; for (band = q->nstrict; band < q->nbands; band++) { if (q->classes[band].qdisc->q.qlen) list_del(&q->classes[band].alist); } for (band = 0; band < q->nbands; band++) qdisc_reset(q->classes[band].qdisc); } static void ets_qdisc_destroy(struct Qdisc *sch) { struct ets_sched *q = qdisc_priv(sch); int band; ets_offload_destroy(sch); tcf_block_put(q->block); for (band = 0; band < q->nbands; band++) qdisc_put(q->classes[band].qdisc); } static int ets_qdisc_dump(struct Qdisc *sch, struct sk_buff *skb) { struct ets_sched *q = qdisc_priv(sch); struct nlattr *opts; struct nlattr *nest; int band; int prio; int err; err = ets_offload_dump(sch); if (err) return err; opts = nla_nest_start_noflag(skb, TCA_OPTIONS); if (!opts) goto nla_err; if (nla_put_u8(skb, TCA_ETS_NBANDS, q->nbands)) goto nla_err; if (q->nstrict && nla_put_u8(skb, TCA_ETS_NSTRICT, q->nstrict)) goto nla_err; if (q->nbands > q->nstrict) { nest = nla_nest_start(skb, TCA_ETS_QUANTA); if (!nest) goto nla_err; for (band = q->nstrict; band < q->nbands; band++) { if (nla_put_u32(skb, TCA_ETS_QUANTA_BAND, q->classes[band].quantum)) goto nla_err; } nla_nest_end(skb, nest); } nest = nla_nest_start(skb, TCA_ETS_PRIOMAP); if (!nest) goto nla_err; for (prio = 0; prio <= TC_PRIO_MAX; prio++) { if (nla_put_u8(skb, TCA_ETS_PRIOMAP_BAND, q->prio2band[prio])) goto nla_err; } nla_nest_end(skb, nest); return nla_nest_end(skb, opts); nla_err: nla_nest_cancel(skb, opts); return -EMSGSIZE; } static const struct Qdisc_class_ops ets_class_ops = { .change = ets_class_change, .graft = ets_class_graft, .leaf = ets_class_leaf, .find = ets_class_find, .qlen_notify = ets_class_qlen_notify, .dump = ets_class_dump, .dump_stats = ets_class_dump_stats, .walk = ets_qdisc_walk, .tcf_block = ets_qdisc_tcf_block, .bind_tcf = ets_qdisc_bind_tcf, .unbind_tcf = ets_qdisc_unbind_tcf, }; static struct Qdisc_ops ets_qdisc_ops __read_mostly = { .cl_ops = &ets_class_ops, .id = "ets", .priv_size = sizeof(struct ets_sched), .enqueue = ets_qdisc_enqueue, .dequeue = ets_qdisc_dequeue, .peek = qdisc_peek_dequeued, .change = ets_qdisc_change, .init = ets_qdisc_init, .reset = ets_qdisc_reset, .destroy = ets_qdisc_destroy, .dump = ets_qdisc_dump, .owner = THIS_MODULE, }; static int __init ets_init(void) { return register_qdisc(&ets_qdisc_ops); } static void __exit ets_exit(void) { unregister_qdisc(&ets_qdisc_ops); } module_init(ets_init); module_exit(ets_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Enhanced Transmission Selection(ETS) scheduler");
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2014 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the hash:mac type */ #include <linux/jhash.h> #include <linux/module.h> #include <linux/etherdevice.h> #include <linux/skbuff.h> #include <linux/errno.h> #include <linux/if_ether.h> #include <net/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/ipset/ip_set.h> #include <linux/netfilter/ipset/ip_set_hash.h> #define IPSET_TYPE_REV_MIN 0 #define IPSET_TYPE_REV_MAX 1 /* bucketsize, initval support */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); IP_SET_MODULE_DESC("hash:mac", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:mac"); /* Type specific function prefix */ #define HTYPE hash_mac /* Member elements */ struct hash_mac4_elem { /* Zero valued IP addresses cannot be stored */ union { unsigned char ether[ETH_ALEN]; __be32 foo[2]; }; }; /* Common functions */ static bool hash_mac4_data_equal(const struct hash_mac4_elem *e1, const struct hash_mac4_elem *e2, u32 *multi) { return ether_addr_equal(e1->ether, e2->ether); } static bool hash_mac4_data_list(struct sk_buff *skb, const struct hash_mac4_elem *e) { if (nla_put(skb, IPSET_ATTR_ETHER, ETH_ALEN, e->ether)) goto nla_put_failure; return false; nla_put_failure: return true; } static void hash_mac4_data_next(struct hash_mac4_elem *next, const struct hash_mac4_elem *e) { } #define MTYPE hash_mac4 #define HOST_MASK 32 #define IP_SET_EMIT_CREATE #define IP_SET_PROTO_UNDEF #include "ip_set_hash_gen.h" static int hash_mac4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt) { ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_mac4_elem e = { { .foo[0] = 0, .foo[1] = 0 } }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); if (skb_mac_header(skb) < skb->head || (skb_mac_header(skb) + ETH_HLEN) > skb->data) return -EINVAL; if (opt->flags & IPSET_DIM_ONE_SRC) ether_addr_copy(e.ether, eth_hdr(skb)->h_source); else ether_addr_copy(e.ether, eth_hdr(skb)->h_dest); if (is_zero_ether_addr(e.ether)) return -EINVAL; return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_mac4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_mac4_elem e = { { .foo[0] = 0, .foo[1] = 0 } }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); int ret; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); if (unlikely(!tb[IPSET_ATTR_ETHER] || nla_len(tb[IPSET_ATTR_ETHER]) != ETH_ALEN)) return -IPSET_ERR_PROTOCOL; ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; ether_addr_copy(e.ether, nla_data(tb[IPSET_ATTR_ETHER])); if (is_zero_ether_addr(e.ether)) return -IPSET_ERR_HASH_ELEM; return adtfn(set, &e, &ext, &ext, flags); } static struct ip_set_type hash_mac_type __read_mostly = { .name = "hash:mac", .protocol = IPSET_PROTOCOL, .features = IPSET_TYPE_MAC, .dimension = IPSET_DIM_ONE, .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_mac_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, [IPSET_ATTR_INITVAL] = { .type = NLA_U32 }, [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { [IPSET_ATTR_ETHER] = { .type = NLA_BINARY, .len = ETH_ALEN }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; static int __init hash_mac_init(void) { return ip_set_type_register(&hash_mac_type); } static void __exit hash_mac_fini(void) { rcu_barrier(); ip_set_type_unregister(&hash_mac_type); } module_init(hash_mac_init); module_exit(hash_mac_fini);
22 7 1 1 5 1 9 1 1040 1 3 3 7 10 2 1 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 // SPDX-License-Identifier: GPL-2.0 /* * Copyright IBM Corporation, 2021 * * Author: Mike Rapoport <rppt@linux.ibm.com> */ #include <linux/mm.h> #include <linux/fs.h> #include <linux/swap.h> #include <linux/mount.h> #include <linux/memfd.h> #include <linux/bitops.h> #include <linux/printk.h> #include <linux/pagemap.h> #include <linux/syscalls.h> #include <linux/pseudo_fs.h> #include <linux/secretmem.h> #include <linux/set_memory.h> #include <linux/sched/signal.h> #include <uapi/linux/magic.h> #include <asm/tlbflush.h> #include "internal.h" #undef pr_fmt #define pr_fmt(fmt) "secretmem: " fmt /* * Define mode and flag masks to allow validation of the system call * parameters. */ #define SECRETMEM_MODE_MASK (0x0) #define SECRETMEM_FLAGS_MASK SECRETMEM_MODE_MASK static bool secretmem_enable __ro_after_init = 1; module_param_named(enable, secretmem_enable, bool, 0400); MODULE_PARM_DESC(secretmem_enable, "Enable secretmem and memfd_secret(2) system call"); static atomic_t secretmem_users; bool secretmem_active(void) { return !!atomic_read(&secretmem_users); } static vm_fault_t secretmem_fault(struct vm_fault *vmf) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; struct inode *inode = file_inode(vmf->vma->vm_file); pgoff_t offset = vmf->pgoff; gfp_t gfp = vmf->gfp_mask; unsigned long addr; struct page *page; struct folio *folio; vm_fault_t ret; int err; if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode)) return vmf_error(-EINVAL); filemap_invalidate_lock_shared(mapping); retry: page = find_lock_page(mapping, offset); if (!page) { folio = folio_alloc(gfp | __GFP_ZERO, 0); if (!folio) { ret = VM_FAULT_OOM; goto out; } page = &folio->page; err = set_direct_map_invalid_noflush(page); if (err) { folio_put(folio); ret = vmf_error(err); goto out; } __folio_mark_uptodate(folio); err = filemap_add_folio(mapping, folio, offset, gfp); if (unlikely(err)) { folio_put(folio); /* * If a split of large page was required, it * already happened when we marked the page invalid * which guarantees that this call won't fail */ set_direct_map_default_noflush(page); if (err == -EEXIST) goto retry; ret = vmf_error(err); goto out; } addr = (unsigned long)page_address(page); flush_tlb_kernel_range(addr, addr + PAGE_SIZE); } vmf->page = page; ret = VM_FAULT_LOCKED; out: filemap_invalidate_unlock_shared(mapping); return ret; } static const struct vm_operations_struct secretmem_vm_ops = { .fault = secretmem_fault, }; static int secretmem_release(struct inode *inode, struct file *file) { atomic_dec(&secretmem_users); return 0; } static int secretmem_mmap(struct file *file, struct vm_area_struct *vma) { unsigned long len = vma->vm_end - vma->vm_start; if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) == 0) return -EINVAL; if (!mlock_future_ok(vma->vm_mm, vma->vm_flags | VM_LOCKED, len)) return -EAGAIN; vm_flags_set(vma, VM_LOCKED | VM_DONTDUMP); vma->vm_ops = &secretmem_vm_ops; return 0; } bool vma_is_secretmem(struct vm_area_struct *vma) { return vma->vm_ops == &secretmem_vm_ops; } static const struct file_operations secretmem_fops = { .release = secretmem_release, .mmap = secretmem_mmap, }; static int secretmem_migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode) { return -EBUSY; } static void secretmem_free_folio(struct folio *folio) { set_direct_map_default_noflush(&folio->page); folio_zero_segment(folio, 0, folio_size(folio)); } const struct address_space_operations secretmem_aops = { .dirty_folio = noop_dirty_folio, .free_folio = secretmem_free_folio, .migrate_folio = secretmem_migrate_folio, }; static int secretmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { struct inode *inode = d_inode(dentry); struct address_space *mapping = inode->i_mapping; unsigned int ia_valid = iattr->ia_valid; int ret; filemap_invalidate_lock(mapping); if ((ia_valid & ATTR_SIZE) && inode->i_size) ret = -EINVAL; else ret = simple_setattr(idmap, dentry, iattr); filemap_invalidate_unlock(mapping); return ret; } static const struct inode_operations secretmem_iops = { .setattr = secretmem_setattr, }; static struct vfsmount *secretmem_mnt; static struct file *secretmem_file_create(unsigned long flags) { struct file *file; struct inode *inode; const char *anon_name = "[secretmem]"; const struct qstr qname = QSTR_INIT(anon_name, strlen(anon_name)); int err; inode = alloc_anon_inode(secretmem_mnt->mnt_sb); if (IS_ERR(inode)) return ERR_CAST(inode); err = security_inode_init_security_anon(inode, &qname, NULL); if (err) { file = ERR_PTR(err); goto err_free_inode; } file = alloc_file_pseudo(inode, secretmem_mnt, "secretmem", O_RDWR, &secretmem_fops); if (IS_ERR(file)) goto err_free_inode; mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); mapping_set_unevictable(inode->i_mapping); inode->i_op = &secretmem_iops; inode->i_mapping->a_ops = &secretmem_aops; /* pretend we are a normal file with zero size */ inode->i_mode |= S_IFREG; inode->i_size = 0; return file; err_free_inode: iput(inode); return file; } SYSCALL_DEFINE1(memfd_secret, unsigned int, flags) { struct file *file; int fd, err; /* make sure local flags do not confict with global fcntl.h */ BUILD_BUG_ON(SECRETMEM_FLAGS_MASK & O_CLOEXEC); if (!secretmem_enable) return -ENOSYS; if (flags & ~(SECRETMEM_FLAGS_MASK | O_CLOEXEC)) return -EINVAL; if (atomic_read(&secretmem_users) < 0) return -ENFILE; fd = get_unused_fd_flags(flags & O_CLOEXEC); if (fd < 0) return fd; file = secretmem_file_create(flags); if (IS_ERR(file)) { err = PTR_ERR(file); goto err_put_fd; } file->f_flags |= O_LARGEFILE; atomic_inc(&secretmem_users); fd_install(fd, file); return fd; err_put_fd: put_unused_fd(fd); return err; } static int secretmem_init_fs_context(struct fs_context *fc) { return init_pseudo(fc, SECRETMEM_MAGIC) ? 0 : -ENOMEM; } static struct file_system_type secretmem_fs = { .name = "secretmem", .init_fs_context = secretmem_init_fs_context, .kill_sb = kill_anon_super, }; static int __init secretmem_init(void) { if (!secretmem_enable) return 0; secretmem_mnt = kern_mount(&secretmem_fs); if (IS_ERR(secretmem_mnt)) return PTR_ERR(secretmem_mnt); /* prevent secretmem mappings from ever getting PROT_EXEC */ secretmem_mnt->mnt_flags |= MNT_NOEXEC; return 0; } fs_initcall(secretmem_init);
62 177 1 15 104 1 37 100 17 10 3 65 9 66 14 13 203 160 17 200 3422 3419 173 3423 345 107 12 105 2 107 83 82 3 83 70 72 9 78 29 5 1 22 1 55 2 54 41 2 114 18 101 132 51 41 10 114 113 2 111 2 4 4 2 2 99 97 99 7 80 5 59 42 1 21 26 4 6 9 18 24 17 2 17 6 15 10 7 7 7 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 // SPDX-License-Identifier: GPL-2.0 /* * linux/mm/mlock.c * * (C) Copyright 1995 Linus Torvalds * (C) Copyright 2002 Christoph Hellwig */ #include <linux/capability.h> #include <linux/mman.h> #include <linux/mm.h> #include <linux/sched/user.h> #include <linux/swap.h> #include <linux/swapops.h> #include <linux/pagemap.h> #include <linux/pagevec.h> #include <linux/pagewalk.h> #include <linux/mempolicy.h> #include <linux/syscalls.h> #include <linux/sched.h> #include <linux/export.h> #include <linux/rmap.h> #include <linux/mmzone.h> #include <linux/hugetlb.h> #include <linux/memcontrol.h> #include <linux/mm_inline.h> #include <linux/secretmem.h> #include "internal.h" struct mlock_fbatch { local_lock_t lock; struct folio_batch fbatch; }; static DEFINE_PER_CPU(struct mlock_fbatch, mlock_fbatch) = { .lock = INIT_LOCAL_LOCK(lock), }; bool can_do_mlock(void) { if (rlimit(RLIMIT_MEMLOCK) != 0) return true; if (capable(CAP_IPC_LOCK)) return true; return false; } EXPORT_SYMBOL(can_do_mlock); /* * Mlocked folios are marked with the PG_mlocked flag for efficient testing * in vmscan and, possibly, the fault path; and to support semi-accurate * statistics. * * An mlocked folio [folio_test_mlocked(folio)] is unevictable. As such, it * will be ostensibly placed on the LRU "unevictable" list (actually no such * list exists), rather than the [in]active lists. PG_unevictable is set to * indicate the unevictable state. */ static struct lruvec *__mlock_folio(struct folio *folio, struct lruvec *lruvec) { /* There is nothing more we can do while it's off LRU */ if (!folio_test_clear_lru(folio)) return lruvec; lruvec = folio_lruvec_relock_irq(folio, lruvec); if (unlikely(folio_evictable(folio))) { /* * This is a little surprising, but quite possible: PG_mlocked * must have got cleared already by another CPU. Could this * folio be unevictable? I'm not sure, but move it now if so. */ if (folio_test_unevictable(folio)) { lruvec_del_folio(lruvec, folio); folio_clear_unevictable(folio); lruvec_add_folio(lruvec, folio); __count_vm_events(UNEVICTABLE_PGRESCUED, folio_nr_pages(folio)); } goto out; } if (folio_test_unevictable(folio)) { if (folio_test_mlocked(folio)) folio->mlock_count++; goto out; } lruvec_del_folio(lruvec, folio); folio_clear_active(folio); folio_set_unevictable(folio); folio->mlock_count = !!folio_test_mlocked(folio); lruvec_add_folio(lruvec, folio); __count_vm_events(UNEVICTABLE_PGCULLED, folio_nr_pages(folio)); out: folio_set_lru(folio); return lruvec; } static struct lruvec *__mlock_new_folio(struct folio *folio, struct lruvec *lruvec) { VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); lruvec = folio_lruvec_relock_irq(folio, lruvec); /* As above, this is a little surprising, but possible */ if (unlikely(folio_evictable(folio))) goto out; folio_set_unevictable(folio); folio->mlock_count = !!folio_test_mlocked(folio); __count_vm_events(UNEVICTABLE_PGCULLED, folio_nr_pages(folio)); out: lruvec_add_folio(lruvec, folio); folio_set_lru(folio); return lruvec; } static struct lruvec *__munlock_folio(struct folio *folio, struct lruvec *lruvec) { int nr_pages = folio_nr_pages(folio); bool isolated = false; if (!folio_test_clear_lru(folio)) goto munlock; isolated = true; lruvec = folio_lruvec_relock_irq(folio, lruvec); if (folio_test_unevictable(folio)) { /* Then mlock_count is maintained, but might undercount */ if (folio->mlock_count) folio->mlock_count--; if (folio->mlock_count) goto out; } /* else assume that was the last mlock: reclaim will fix it if not */ munlock: if (folio_test_clear_mlocked(folio)) { __zone_stat_mod_folio(folio, NR_MLOCK, -nr_pages); if (isolated || !folio_test_unevictable(folio)) __count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages); else __count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages); } /* folio_evictable() has to be checked *after* clearing Mlocked */ if (isolated && folio_test_unevictable(folio) && folio_evictable(folio)) { lruvec_del_folio(lruvec, folio); folio_clear_unevictable(folio); lruvec_add_folio(lruvec, folio); __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages); } out: if (isolated) folio_set_lru(folio); return lruvec; } /* * Flags held in the low bits of a struct folio pointer on the mlock_fbatch. */ #define LRU_FOLIO 0x1 #define NEW_FOLIO 0x2 static inline struct folio *mlock_lru(struct folio *folio) { return (struct folio *)((unsigned long)folio + LRU_FOLIO); } static inline struct folio *mlock_new(struct folio *folio) { return (struct folio *)((unsigned long)folio + NEW_FOLIO); } /* * mlock_folio_batch() is derived from folio_batch_move_lru(): perhaps that can * make use of such folio pointer flags in future, but for now just keep it for * mlock. We could use three separate folio batches instead, but one feels * better (munlocking a full folio batch does not need to drain mlocking folio * batches first). */ static void mlock_folio_batch(struct folio_batch *fbatch) { struct lruvec *lruvec = NULL; unsigned long mlock; struct folio *folio; int i; for (i = 0; i < folio_batch_count(fbatch); i++) { folio = fbatch->folios[i]; mlock = (unsigned long)folio & (LRU_FOLIO | NEW_FOLIO); folio = (struct folio *)((unsigned long)folio - mlock); fbatch->folios[i] = folio; if (mlock & LRU_FOLIO) lruvec = __mlock_folio(folio, lruvec); else if (mlock & NEW_FOLIO) lruvec = __mlock_new_folio(folio, lruvec); else lruvec = __munlock_folio(folio, lruvec); } if (lruvec) unlock_page_lruvec_irq(lruvec); folios_put(fbatch->folios, folio_batch_count(fbatch)); folio_batch_reinit(fbatch); } void mlock_drain_local(void) { struct folio_batch *fbatch; local_lock(&mlock_fbatch.lock); fbatch = this_cpu_ptr(&mlock_fbatch.fbatch); if (folio_batch_count(fbatch)) mlock_folio_batch(fbatch); local_unlock(&mlock_fbatch.lock); } void mlock_drain_remote(int cpu) { struct folio_batch *fbatch; WARN_ON_ONCE(cpu_online(cpu)); fbatch = &per_cpu(mlock_fbatch.fbatch, cpu); if (folio_batch_count(fbatch)) mlock_folio_batch(fbatch); } bool need_mlock_drain(int cpu) { return folio_batch_count(&per_cpu(mlock_fbatch.fbatch, cpu)); } /** * mlock_folio - mlock a folio already on (or temporarily off) LRU * @folio: folio to be mlocked. */ void mlock_folio(struct folio *folio) { struct folio_batch *fbatch; local_lock(&mlock_fbatch.lock); fbatch = this_cpu_ptr(&mlock_fbatch.fbatch); if (!folio_test_set_mlocked(folio)) { int nr_pages = folio_nr_pages(folio); zone_stat_mod_folio(folio, NR_MLOCK, nr_pages); __count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages); } folio_get(folio); if (!folio_batch_add(fbatch, mlock_lru(folio)) || folio_test_large(folio) || lru_cache_disabled()) mlock_folio_batch(fbatch); local_unlock(&mlock_fbatch.lock); } /** * mlock_new_folio - mlock a newly allocated folio not yet on LRU * @folio: folio to be mlocked, either normal or a THP head. */ void mlock_new_folio(struct folio *folio) { struct folio_batch *fbatch; int nr_pages = folio_nr_pages(folio); local_lock(&mlock_fbatch.lock); fbatch = this_cpu_ptr(&mlock_fbatch.fbatch); folio_set_mlocked(folio); zone_stat_mod_folio(folio, NR_MLOCK, nr_pages); __count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages); folio_get(folio); if (!folio_batch_add(fbatch, mlock_new(folio)) || folio_test_large(folio) || lru_cache_disabled()) mlock_folio_batch(fbatch); local_unlock(&mlock_fbatch.lock); } /** * munlock_folio - munlock a folio * @folio: folio to be munlocked, either normal or a THP head. */ void munlock_folio(struct folio *folio) { struct folio_batch *fbatch; local_lock(&mlock_fbatch.lock); fbatch = this_cpu_ptr(&mlock_fbatch.fbatch); /* * folio_test_clear_mlocked(folio) must be left to __munlock_folio(), * which will check whether the folio is multiply mlocked. */ folio_get(folio); if (!folio_batch_add(fbatch, folio) || folio_test_large(folio) || lru_cache_disabled()) mlock_folio_batch(fbatch); local_unlock(&mlock_fbatch.lock); } static inline unsigned int folio_mlock_step(struct folio *folio, pte_t *pte, unsigned long addr, unsigned long end) { unsigned int count, i, nr = folio_nr_pages(folio); unsigned long pfn = folio_pfn(folio); pte_t ptent = ptep_get(pte); if (!folio_test_large(folio)) return 1; count = pfn + nr - pte_pfn(ptent); count = min_t(unsigned int, count, (end - addr) >> PAGE_SHIFT); for (i = 0; i < count; i++, pte++) { pte_t entry = ptep_get(pte); if (!pte_present(entry)) break; if (pte_pfn(entry) - pfn >= nr) break; } return i; } static inline bool allow_mlock_munlock(struct folio *folio, struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned int step) { /* * For unlock, allow munlock large folio which is partially * mapped to VMA. As it's possible that large folio is * mlocked and VMA is split later. * * During memory pressure, such kind of large folio can * be split. And the pages are not in VM_LOCKed VMA * can be reclaimed. */ if (!(vma->vm_flags & VM_LOCKED)) return true; /* folio_within_range() cannot take KSM, but any small folio is OK */ if (!folio_test_large(folio)) return true; /* folio not in range [start, end), skip mlock */ if (!folio_within_range(folio, vma, start, end)) return false; /* folio is not fully mapped, skip mlock */ if (step != folio_nr_pages(folio)) return false; return true; } static int mlock_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct vm_area_struct *vma = walk->vma; spinlock_t *ptl; pte_t *start_pte, *pte; pte_t ptent; struct folio *folio; unsigned int step = 1; unsigned long start = addr; ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { if (!pmd_present(*pmd)) goto out; if (is_huge_zero_pmd(*pmd)) goto out; folio = page_folio(pmd_page(*pmd)); if (vma->vm_flags & VM_LOCKED) mlock_folio(folio); else munlock_folio(folio); goto out; } start_pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); if (!start_pte) { walk->action = ACTION_AGAIN; return 0; } for (pte = start_pte; addr != end; pte++, addr += PAGE_SIZE) { ptent = ptep_get(pte); if (!pte_present(ptent)) continue; folio = vm_normal_folio(vma, addr, ptent); if (!folio || folio_is_zone_device(folio)) continue; step = folio_mlock_step(folio, pte, addr, end); if (!allow_mlock_munlock(folio, vma, start, end, step)) goto next_entry; if (vma->vm_flags & VM_LOCKED) mlock_folio(folio); else munlock_folio(folio); next_entry: pte += step - 1; addr += (step - 1) << PAGE_SHIFT; } pte_unmap(start_pte); out: spin_unlock(ptl); cond_resched(); return 0; } /* * mlock_vma_pages_range() - mlock any pages already in the range, * or munlock all pages in the range. * @vma - vma containing range to be mlock()ed or munlock()ed * @start - start address in @vma of the range * @end - end of range in @vma * @newflags - the new set of flags for @vma. * * Called for mlock(), mlock2() and mlockall(), to set @vma VM_LOCKED; * called for munlock() and munlockall(), to clear VM_LOCKED from @vma. */ static void mlock_vma_pages_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, vm_flags_t newflags) { static const struct mm_walk_ops mlock_walk_ops = { .pmd_entry = mlock_pte_range, .walk_lock = PGWALK_WRLOCK_VERIFY, }; /* * There is a slight chance that concurrent page migration, * or page reclaim finding a page of this now-VM_LOCKED vma, * will call mlock_vma_folio() and raise page's mlock_count: * double counting, leaving the page unevictable indefinitely. * Communicate this danger to mlock_vma_folio() with VM_IO, * which is a VM_SPECIAL flag not allowed on VM_LOCKED vmas. * mmap_lock is held in write mode here, so this weird * combination should not be visible to other mmap_lock users; * but WRITE_ONCE so rmap walkers must see VM_IO if VM_LOCKED. */ if (newflags & VM_LOCKED) newflags |= VM_IO; vma_start_write(vma); vm_flags_reset_once(vma, newflags); lru_add_drain(); walk_page_range(vma->vm_mm, start, end, &mlock_walk_ops, NULL); lru_add_drain(); if (newflags & VM_IO) { newflags &= ~VM_IO; vm_flags_reset_once(vma, newflags); } } /* * mlock_fixup - handle mlock[all]/munlock[all] requests. * * Filters out "special" vmas -- VM_LOCKED never gets set for these, and * munlock is a no-op. However, for some special vmas, we go ahead and * populate the ptes. * * For vmas that pass the filters, merge/split as appropriate. */ static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, vm_flags_t newflags) { struct mm_struct *mm = vma->vm_mm; int nr_pages; int ret = 0; vm_flags_t oldflags = vma->vm_flags; if (newflags == oldflags || (oldflags & VM_SPECIAL) || is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) || vma_is_dax(vma) || vma_is_secretmem(vma)) /* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */ goto out; vma = vma_modify_flags(vmi, *prev, vma, start, end, newflags); if (IS_ERR(vma)) { ret = PTR_ERR(vma); goto out; } /* * Keep track of amount of locked VM. */ nr_pages = (end - start) >> PAGE_SHIFT; if (!(newflags & VM_LOCKED)) nr_pages = -nr_pages; else if (oldflags & VM_LOCKED) nr_pages = 0; mm->locked_vm += nr_pages; /* * vm_flags is protected by the mmap_lock held in write mode. * It's okay if try_to_unmap_one unmaps a page just after we * set VM_LOCKED, populate_vma_page_range will bring it back. */ if ((newflags & VM_LOCKED) && (oldflags & VM_LOCKED)) { /* No work to do, and mlocking twice would be wrong */ vma_start_write(vma); vm_flags_reset(vma, newflags); } else { mlock_vma_pages_range(vma, start, end, newflags); } out: *prev = vma; return ret; } static int apply_vma_lock_flags(unsigned long start, size_t len, vm_flags_t flags) { unsigned long nstart, end, tmp; struct vm_area_struct *vma, *prev; VMA_ITERATOR(vmi, current->mm, start); VM_BUG_ON(offset_in_page(start)); VM_BUG_ON(len != PAGE_ALIGN(len)); end = start + len; if (end < start) return -EINVAL; if (end == start) return 0; vma = vma_iter_load(&vmi); if (!vma) return -ENOMEM; prev = vma_prev(&vmi); if (start > vma->vm_start) prev = vma; nstart = start; tmp = vma->vm_start; for_each_vma_range(vmi, vma, end) { int error; vm_flags_t newflags; if (vma->vm_start != tmp) return -ENOMEM; newflags = vma->vm_flags & ~VM_LOCKED_MASK; newflags |= flags; /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ tmp = vma->vm_end; if (tmp > end) tmp = end; error = mlock_fixup(&vmi, vma, &prev, nstart, tmp, newflags); if (error) return error; tmp = vma_iter_end(&vmi); nstart = tmp; } if (tmp < end) return -ENOMEM; return 0; } /* * Go through vma areas and sum size of mlocked * vma pages, as return value. * Note deferred memory locking case(mlock2(,,MLOCK_ONFAULT) * is also counted. * Return value: previously mlocked page counts */ static unsigned long count_mm_mlocked_page_nr(struct mm_struct *mm, unsigned long start, size_t len) { struct vm_area_struct *vma; unsigned long count = 0; unsigned long end; VMA_ITERATOR(vmi, mm, start); /* Don't overflow past ULONG_MAX */ if (unlikely(ULONG_MAX - len < start)) end = ULONG_MAX; else end = start + len; for_each_vma_range(vmi, vma, end) { if (vma->vm_flags & VM_LOCKED) { if (start > vma->vm_start) count -= (start - vma->vm_start); if (end < vma->vm_end) { count += end - vma->vm_start; break; } count += vma->vm_end - vma->vm_start; } } return count >> PAGE_SHIFT; } /* * convert get_user_pages() return value to posix mlock() error */ static int __mlock_posix_error_return(long retval) { if (retval == -EFAULT) retval = -ENOMEM; else if (retval == -ENOMEM) retval = -EAGAIN; return retval; } static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t flags) { unsigned long locked; unsigned long lock_limit; int error = -ENOMEM; start = untagged_addr(start); if (!can_do_mlock()) return -EPERM; len = PAGE_ALIGN(len + (offset_in_page(start))); start &= PAGE_MASK; lock_limit = rlimit(RLIMIT_MEMLOCK); lock_limit >>= PAGE_SHIFT; locked = len >> PAGE_SHIFT; if (mmap_write_lock_killable(current->mm)) return -EINTR; locked += current->mm->locked_vm; if ((locked > lock_limit) && (!capable(CAP_IPC_LOCK))) { /* * It is possible that the regions requested intersect with * previously mlocked areas, that part area in "mm->locked_vm" * should not be counted to new mlock increment count. So check * and adjust locked count if necessary. */ locked -= count_mm_mlocked_page_nr(current->mm, start, len); } /* check against resource limits */ if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) error = apply_vma_lock_flags(start, len, flags); mmap_write_unlock(current->mm); if (error) return error; error = __mm_populate(start, len, 0); if (error) return __mlock_posix_error_return(error); return 0; } SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) { return do_mlock(start, len, VM_LOCKED); } SYSCALL_DEFINE3(mlock2, unsigned long, start, size_t, len, int, flags) { vm_flags_t vm_flags = VM_LOCKED; if (flags & ~MLOCK_ONFAULT) return -EINVAL; if (flags & MLOCK_ONFAULT) vm_flags |= VM_LOCKONFAULT; return do_mlock(start, len, vm_flags); } SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) { int ret; start = untagged_addr(start); len = PAGE_ALIGN(len + (offset_in_page(start))); start &= PAGE_MASK; if (mmap_write_lock_killable(current->mm)) return -EINTR; ret = apply_vma_lock_flags(start, len, 0); mmap_write_unlock(current->mm); return ret; } /* * Take the MCL_* flags passed into mlockall (or 0 if called from munlockall) * and translate into the appropriate modifications to mm->def_flags and/or the * flags for all current VMAs. * * There are a couple of subtleties with this. If mlockall() is called multiple * times with different flags, the values do not necessarily stack. If mlockall * is called once including the MCL_FUTURE flag and then a second time without * it, VM_LOCKED and VM_LOCKONFAULT will be cleared from mm->def_flags. */ static int apply_mlockall_flags(int flags) { VMA_ITERATOR(vmi, current->mm, 0); struct vm_area_struct *vma, *prev = NULL; vm_flags_t to_add = 0; current->mm->def_flags &= ~VM_LOCKED_MASK; if (flags & MCL_FUTURE) { current->mm->def_flags |= VM_LOCKED; if (flags & MCL_ONFAULT) current->mm->def_flags |= VM_LOCKONFAULT; if (!(flags & MCL_CURRENT)) goto out; } if (flags & MCL_CURRENT) { to_add |= VM_LOCKED; if (flags & MCL_ONFAULT) to_add |= VM_LOCKONFAULT; } for_each_vma(vmi, vma) { vm_flags_t newflags; newflags = vma->vm_flags & ~VM_LOCKED_MASK; newflags |= to_add; /* Ignore errors */ mlock_fixup(&vmi, vma, &prev, vma->vm_start, vma->vm_end, newflags); cond_resched(); } out: return 0; } SYSCALL_DEFINE1(mlockall, int, flags) { unsigned long lock_limit; int ret; if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT)) || flags == MCL_ONFAULT) return -EINVAL; if (!can_do_mlock()) return -EPERM; lock_limit = rlimit(RLIMIT_MEMLOCK); lock_limit >>= PAGE_SHIFT; if (mmap_write_lock_killable(current->mm)) return -EINTR; ret = -ENOMEM; if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) || capable(CAP_IPC_LOCK)) ret = apply_mlockall_flags(flags); mmap_write_unlock(current->mm); if (!ret && (flags & MCL_CURRENT)) mm_populate(0, TASK_SIZE); return ret; } SYSCALL_DEFINE0(munlockall) { int ret; if (mmap_write_lock_killable(current->mm)) return -EINTR; ret = apply_mlockall_flags(0); mmap_write_unlock(current->mm); return ret; } /* * Objects with different lifetime than processes (SHM_LOCK and SHM_HUGETLB * shm segments) get accounted against the user_struct instead. */ static DEFINE_SPINLOCK(shmlock_user_lock); int user_shm_lock(size_t size, struct ucounts *ucounts) { unsigned long lock_limit, locked; long memlock; int allowed = 0; locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; lock_limit = rlimit(RLIMIT_MEMLOCK); if (lock_limit != RLIM_INFINITY) lock_limit >>= PAGE_SHIFT; spin_lock(&shmlock_user_lock); memlock = inc_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked); if ((memlock == LONG_MAX || memlock > lock_limit) && !capable(CAP_IPC_LOCK)) { dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked); goto out; } if (!get_ucounts(ucounts)) { dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked); allowed = 0; goto out; } allowed = 1; out: spin_unlock(&shmlock_user_lock); return allowed; } void user_shm_unlock(size_t size, struct ucounts *ucounts) { spin_lock(&shmlock_user_lock); dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, (size + PAGE_SIZE - 1) >> PAGE_SHIFT); spin_unlock(&shmlock_user_lock); put_ucounts(ucounts); }
1 16 1 1 2 9 3 7 1 4 10 1 2 8 1 1 6 2 6 2 5 3 4 4 5 3 7 8 8 6 7 6 6 6 3 8 1 8 1 1 8 8 1 1 1 1 1 407 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008, Intel Corporation. * * Author: Alexander Duyck <alexander.h.duyck@intel.com> */ #include <linux/module.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/dsfield.h> #include <net/pkt_cls.h> #include <net/tc_wrapper.h> #include <linux/tc_act/tc_skbedit.h> #include <net/tc_act/tc_skbedit.h> static struct tc_action_ops act_skbedit_ops; static u16 tcf_skbedit_hash(struct tcf_skbedit_params *params, struct sk_buff *skb) { u16 queue_mapping = params->queue_mapping; if (params->flags & SKBEDIT_F_TXQ_SKBHASH) { u32 hash = skb_get_hash(skb); queue_mapping += hash % params->mapping_mod; } return netdev_cap_txqueue(skb->dev, queue_mapping); } TC_INDIRECT_SCOPE int tcf_skbedit_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_skbedit *d = to_skbedit(a); struct tcf_skbedit_params *params; int action; tcf_lastuse_update(&d->tcf_tm); bstats_update(this_cpu_ptr(d->common.cpu_bstats), skb); params = rcu_dereference_bh(d->params); action = READ_ONCE(d->tcf_action); if (params->flags & SKBEDIT_F_PRIORITY) skb->priority = params->priority; if (params->flags & SKBEDIT_F_INHERITDSFIELD) { int wlen = skb_network_offset(skb); switch (skb_protocol(skb, true)) { case htons(ETH_P_IP): wlen += sizeof(struct iphdr); if (!pskb_may_pull(skb, wlen)) goto err; skb->priority = ipv4_get_dsfield(ip_hdr(skb)) >> 2; break; case htons(ETH_P_IPV6): wlen += sizeof(struct ipv6hdr); if (!pskb_may_pull(skb, wlen)) goto err; skb->priority = ipv6_get_dsfield(ipv6_hdr(skb)) >> 2; break; } } if (params->flags & SKBEDIT_F_QUEUE_MAPPING && skb->dev->real_num_tx_queues > params->queue_mapping) { #ifdef CONFIG_NET_EGRESS netdev_xmit_skip_txqueue(true); #endif skb_set_queue_mapping(skb, tcf_skbedit_hash(params, skb)); } if (params->flags & SKBEDIT_F_MARK) { skb->mark &= ~params->mask; skb->mark |= params->mark & params->mask; } if (params->flags & SKBEDIT_F_PTYPE) skb->pkt_type = params->ptype; return action; err: qstats_drop_inc(this_cpu_ptr(d->common.cpu_qstats)); return TC_ACT_SHOT; } static void tcf_skbedit_stats_update(struct tc_action *a, u64 bytes, u64 packets, u64 drops, u64 lastuse, bool hw) { struct tcf_skbedit *d = to_skbedit(a); struct tcf_t *tm = &d->tcf_tm; tcf_action_update_stats(a, bytes, packets, drops, hw); tm->lastuse = max_t(u64, tm->lastuse, lastuse); } static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = { [TCA_SKBEDIT_PARMS] = { .len = sizeof(struct tc_skbedit) }, [TCA_SKBEDIT_PRIORITY] = { .len = sizeof(u32) }, [TCA_SKBEDIT_QUEUE_MAPPING] = { .len = sizeof(u16) }, [TCA_SKBEDIT_MARK] = { .len = sizeof(u32) }, [TCA_SKBEDIT_PTYPE] = { .len = sizeof(u16) }, [TCA_SKBEDIT_MASK] = { .len = sizeof(u32) }, [TCA_SKBEDIT_FLAGS] = { .len = sizeof(u64) }, [TCA_SKBEDIT_QUEUE_MAPPING_MAX] = { .len = sizeof(u16) }, }; static int tcf_skbedit_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, struct tcf_proto *tp, u32 act_flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, act_skbedit_ops.net_id); bool bind = act_flags & TCA_ACT_FLAGS_BIND; struct tcf_skbedit_params *params_new; struct nlattr *tb[TCA_SKBEDIT_MAX + 1]; struct tcf_chain *goto_ch = NULL; struct tc_skbedit *parm; struct tcf_skbedit *d; u32 flags = 0, *priority = NULL, *mark = NULL, *mask = NULL; u16 *queue_mapping = NULL, *ptype = NULL; u16 mapping_mod = 1; bool exists = false; int ret = 0, err; u32 index; if (nla == NULL) return -EINVAL; err = nla_parse_nested_deprecated(tb, TCA_SKBEDIT_MAX, nla, skbedit_policy, NULL); if (err < 0) return err; if (tb[TCA_SKBEDIT_PARMS] == NULL) return -EINVAL; if (tb[TCA_SKBEDIT_PRIORITY] != NULL) { flags |= SKBEDIT_F_PRIORITY; priority = nla_data(tb[TCA_SKBEDIT_PRIORITY]); } if (tb[TCA_SKBEDIT_QUEUE_MAPPING] != NULL) { if (is_tcf_skbedit_ingress(act_flags) && !(act_flags & TCA_ACT_FLAGS_SKIP_SW)) { NL_SET_ERR_MSG_MOD(extack, "\"queue_mapping\" option on receive side is hardware only, use skip_sw"); return -EOPNOTSUPP; } flags |= SKBEDIT_F_QUEUE_MAPPING; queue_mapping = nla_data(tb[TCA_SKBEDIT_QUEUE_MAPPING]); } if (tb[TCA_SKBEDIT_PTYPE] != NULL) { ptype = nla_data(tb[TCA_SKBEDIT_PTYPE]); if (!skb_pkt_type_ok(*ptype)) return -EINVAL; flags |= SKBEDIT_F_PTYPE; } if (tb[TCA_SKBEDIT_MARK] != NULL) { flags |= SKBEDIT_F_MARK; mark = nla_data(tb[TCA_SKBEDIT_MARK]); } if (tb[TCA_SKBEDIT_MASK] != NULL) { flags |= SKBEDIT_F_MASK; mask = nla_data(tb[TCA_SKBEDIT_MASK]); } if (tb[TCA_SKBEDIT_FLAGS] != NULL) { u64 *pure_flags = nla_data(tb[TCA_SKBEDIT_FLAGS]); if (*pure_flags & SKBEDIT_F_TXQ_SKBHASH) { u16 *queue_mapping_max; if (!tb[TCA_SKBEDIT_QUEUE_MAPPING] || !tb[TCA_SKBEDIT_QUEUE_MAPPING_MAX]) { NL_SET_ERR_MSG_MOD(extack, "Missing required range of queue_mapping."); return -EINVAL; } queue_mapping_max = nla_data(tb[TCA_SKBEDIT_QUEUE_MAPPING_MAX]); if (*queue_mapping_max < *queue_mapping) { NL_SET_ERR_MSG_MOD(extack, "The range of queue_mapping is invalid, max < min."); return -EINVAL; } mapping_mod = *queue_mapping_max - *queue_mapping + 1; flags |= SKBEDIT_F_TXQ_SKBHASH; } if (*pure_flags & SKBEDIT_F_INHERITDSFIELD) flags |= SKBEDIT_F_INHERITDSFIELD; } parm = nla_data(tb[TCA_SKBEDIT_PARMS]); index = parm->index; err = tcf_idr_check_alloc(tn, &index, a, bind); if (err < 0) return err; exists = err; if (exists && bind) return ACT_P_BOUND; if (!flags) { if (exists) tcf_idr_release(*a, bind); else tcf_idr_cleanup(tn, index); return -EINVAL; } if (!exists) { ret = tcf_idr_create(tn, index, est, a, &act_skbedit_ops, bind, true, act_flags); if (ret) { tcf_idr_cleanup(tn, index); return ret; } d = to_skbedit(*a); ret = ACT_P_CREATED; } else { d = to_skbedit(*a); if (!(act_flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } } err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); if (err < 0) goto release_idr; params_new = kzalloc(sizeof(*params_new), GFP_KERNEL); if (unlikely(!params_new)) { err = -ENOMEM; goto put_chain; } params_new->flags = flags; if (flags & SKBEDIT_F_PRIORITY) params_new->priority = *priority; if (flags & SKBEDIT_F_QUEUE_MAPPING) { params_new->queue_mapping = *queue_mapping; params_new->mapping_mod = mapping_mod; } if (flags & SKBEDIT_F_MARK) params_new->mark = *mark; if (flags & SKBEDIT_F_PTYPE) params_new->ptype = *ptype; /* default behaviour is to use all the bits */ params_new->mask = 0xffffffff; if (flags & SKBEDIT_F_MASK) params_new->mask = *mask; spin_lock_bh(&d->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); params_new = rcu_replace_pointer(d->params, params_new, lockdep_is_held(&d->tcf_lock)); spin_unlock_bh(&d->tcf_lock); if (params_new) kfree_rcu(params_new, rcu); if (goto_ch) tcf_chain_put_by_act(goto_ch); return ret; put_chain: if (goto_ch) tcf_chain_put_by_act(goto_ch); release_idr: tcf_idr_release(*a, bind); return err; } static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); struct tcf_skbedit *d = to_skbedit(a); struct tcf_skbedit_params *params; struct tc_skbedit opt = { .index = d->tcf_index, .refcnt = refcount_read(&d->tcf_refcnt) - ref, .bindcnt = atomic_read(&d->tcf_bindcnt) - bind, }; u64 pure_flags = 0; struct tcf_t t; spin_lock_bh(&d->tcf_lock); params = rcu_dereference_protected(d->params, lockdep_is_held(&d->tcf_lock)); opt.action = d->tcf_action; if (nla_put(skb, TCA_SKBEDIT_PARMS, sizeof(opt), &opt)) goto nla_put_failure; if ((params->flags & SKBEDIT_F_PRIORITY) && nla_put_u32(skb, TCA_SKBEDIT_PRIORITY, params->priority)) goto nla_put_failure; if ((params->flags & SKBEDIT_F_QUEUE_MAPPING) && nla_put_u16(skb, TCA_SKBEDIT_QUEUE_MAPPING, params->queue_mapping)) goto nla_put_failure; if ((params->flags & SKBEDIT_F_MARK) && nla_put_u32(skb, TCA_SKBEDIT_MARK, params->mark)) goto nla_put_failure; if ((params->flags & SKBEDIT_F_PTYPE) && nla_put_u16(skb, TCA_SKBEDIT_PTYPE, params->ptype)) goto nla_put_failure; if ((params->flags & SKBEDIT_F_MASK) && nla_put_u32(skb, TCA_SKBEDIT_MASK, params->mask)) goto nla_put_failure; if (params->flags & SKBEDIT_F_INHERITDSFIELD) pure_flags |= SKBEDIT_F_INHERITDSFIELD; if (params->flags & SKBEDIT_F_TXQ_SKBHASH) { if (nla_put_u16(skb, TCA_SKBEDIT_QUEUE_MAPPING_MAX, params->queue_mapping + params->mapping_mod - 1)) goto nla_put_failure; pure_flags |= SKBEDIT_F_TXQ_SKBHASH; } if (pure_flags != 0 && nla_put(skb, TCA_SKBEDIT_FLAGS, sizeof(pure_flags), &pure_flags)) goto nla_put_failure; tcf_tm_dump(&t, &d->tcf_tm); if (nla_put_64bit(skb, TCA_SKBEDIT_TM, sizeof(t), &t, TCA_SKBEDIT_PAD)) goto nla_put_failure; spin_unlock_bh(&d->tcf_lock); return skb->len; nla_put_failure: spin_unlock_bh(&d->tcf_lock); nlmsg_trim(skb, b); return -1; } static void tcf_skbedit_cleanup(struct tc_action *a) { struct tcf_skbedit *d = to_skbedit(a); struct tcf_skbedit_params *params; params = rcu_dereference_protected(d->params, 1); if (params) kfree_rcu(params, rcu); } static size_t tcf_skbedit_get_fill_size(const struct tc_action *act) { return nla_total_size(sizeof(struct tc_skbedit)) + nla_total_size(sizeof(u32)) /* TCA_SKBEDIT_PRIORITY */ + nla_total_size(sizeof(u16)) /* TCA_SKBEDIT_QUEUE_MAPPING */ + nla_total_size(sizeof(u16)) /* TCA_SKBEDIT_QUEUE_MAPPING_MAX */ + nla_total_size(sizeof(u32)) /* TCA_SKBEDIT_MARK */ + nla_total_size(sizeof(u16)) /* TCA_SKBEDIT_PTYPE */ + nla_total_size(sizeof(u32)) /* TCA_SKBEDIT_MASK */ + nla_total_size_64bit(sizeof(u64)); /* TCA_SKBEDIT_FLAGS */ } static int tcf_skbedit_offload_act_setup(struct tc_action *act, void *entry_data, u32 *index_inc, bool bind, struct netlink_ext_ack *extack) { if (bind) { struct flow_action_entry *entry = entry_data; if (is_tcf_skbedit_mark(act)) { entry->id = FLOW_ACTION_MARK; entry->mark = tcf_skbedit_mark(act); } else if (is_tcf_skbedit_ptype(act)) { entry->id = FLOW_ACTION_PTYPE; entry->ptype = tcf_skbedit_ptype(act); } else if (is_tcf_skbedit_priority(act)) { entry->id = FLOW_ACTION_PRIORITY; entry->priority = tcf_skbedit_priority(act); } else if (is_tcf_skbedit_tx_queue_mapping(act)) { NL_SET_ERR_MSG_MOD(extack, "Offload not supported when \"queue_mapping\" option is used on transmit side"); return -EOPNOTSUPP; } else if (is_tcf_skbedit_rx_queue_mapping(act)) { entry->id = FLOW_ACTION_RX_QUEUE_MAPPING; entry->rx_queue = tcf_skbedit_rx_queue_mapping(act); } else if (is_tcf_skbedit_inheritdsfield(act)) { NL_SET_ERR_MSG_MOD(extack, "Offload not supported when \"inheritdsfield\" option is used"); return -EOPNOTSUPP; } else { NL_SET_ERR_MSG_MOD(extack, "Unsupported skbedit option offload"); return -EOPNOTSUPP; } *index_inc = 1; } else { struct flow_offload_action *fl_action = entry_data; if (is_tcf_skbedit_mark(act)) fl_action->id = FLOW_ACTION_MARK; else if (is_tcf_skbedit_ptype(act)) fl_action->id = FLOW_ACTION_PTYPE; else if (is_tcf_skbedit_priority(act)) fl_action->id = FLOW_ACTION_PRIORITY; else if (is_tcf_skbedit_rx_queue_mapping(act)) fl_action->id = FLOW_ACTION_RX_QUEUE_MAPPING; else return -EOPNOTSUPP; } return 0; } static struct tc_action_ops act_skbedit_ops = { .kind = "skbedit", .id = TCA_ID_SKBEDIT, .owner = THIS_MODULE, .act = tcf_skbedit_act, .stats_update = tcf_skbedit_stats_update, .dump = tcf_skbedit_dump, .init = tcf_skbedit_init, .cleanup = tcf_skbedit_cleanup, .get_fill_size = tcf_skbedit_get_fill_size, .offload_act_setup = tcf_skbedit_offload_act_setup, .size = sizeof(struct tcf_skbedit), }; static __net_init int skbedit_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, act_skbedit_ops.net_id); return tc_action_net_init(net, tn, &act_skbedit_ops); } static void __net_exit skbedit_exit_net(struct list_head *net_list) { tc_action_net_exit(net_list, act_skbedit_ops.net_id); } static struct pernet_operations skbedit_net_ops = { .init = skbedit_init_net, .exit_batch = skbedit_exit_net, .id = &act_skbedit_ops.net_id, .size = sizeof(struct tc_action_net), }; MODULE_AUTHOR("Alexander Duyck, <alexander.h.duyck@intel.com>"); MODULE_DESCRIPTION("SKB Editing"); MODULE_LICENSE("GPL"); static int __init skbedit_init_module(void) { return tcf_register_action(&act_skbedit_ops, &skbedit_net_ops); } static void __exit skbedit_cleanup_module(void) { tcf_unregister_action(&act_skbedit_ops, &skbedit_net_ops); } module_init(skbedit_init_module); module_exit(skbedit_cleanup_module);
open /syzkaller/managers/ci-upstream-kmsan-gce-root/kernel/drivers/net/wireless/legacy/rndis_wlan.c: no such file or directory
34 1 1 8 1 1 2 5 1 1 32 5 6 1 491 468 31 31 38 37 24 35 1 1 33 939 925 8 6 5 467 281 279 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 // SPDX-License-Identifier: GPL-2.0 /* * Key setup facility for FS encryption support. * * Copyright (C) 2015, Google, Inc. * * Originally written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar. * Heavily modified since then. */ #include <crypto/skcipher.h> #include <linux/random.h> #include "fscrypt_private.h" struct fscrypt_mode fscrypt_modes[] = { [FSCRYPT_MODE_AES_256_XTS] = { .friendly_name = "AES-256-XTS", .cipher_str = "xts(aes)", .keysize = 64, .security_strength = 32, .ivsize = 16, .blk_crypto_mode = BLK_ENCRYPTION_MODE_AES_256_XTS, }, [FSCRYPT_MODE_AES_256_CTS] = { .friendly_name = "AES-256-CTS-CBC", .cipher_str = "cts(cbc(aes))", .keysize = 32, .security_strength = 32, .ivsize = 16, }, [FSCRYPT_MODE_AES_128_CBC] = { .friendly_name = "AES-128-CBC-ESSIV", .cipher_str = "essiv(cbc(aes),sha256)", .keysize = 16, .security_strength = 16, .ivsize = 16, .blk_crypto_mode = BLK_ENCRYPTION_MODE_AES_128_CBC_ESSIV, }, [FSCRYPT_MODE_AES_128_CTS] = { .friendly_name = "AES-128-CTS-CBC", .cipher_str = "cts(cbc(aes))", .keysize = 16, .security_strength = 16, .ivsize = 16, }, [FSCRYPT_MODE_SM4_XTS] = { .friendly_name = "SM4-XTS", .cipher_str = "xts(sm4)", .keysize = 32, .security_strength = 16, .ivsize = 16, .blk_crypto_mode = BLK_ENCRYPTION_MODE_SM4_XTS, }, [FSCRYPT_MODE_SM4_CTS] = { .friendly_name = "SM4-CTS-CBC", .cipher_str = "cts(cbc(sm4))", .keysize = 16, .security_strength = 16, .ivsize = 16, }, [FSCRYPT_MODE_ADIANTUM] = { .friendly_name = "Adiantum", .cipher_str = "adiantum(xchacha12,aes)", .keysize = 32, .security_strength = 32, .ivsize = 32, .blk_crypto_mode = BLK_ENCRYPTION_MODE_ADIANTUM, }, [FSCRYPT_MODE_AES_256_HCTR2] = { .friendly_name = "AES-256-HCTR2", .cipher_str = "hctr2(aes)", .keysize = 32, .security_strength = 32, .ivsize = 32, }, }; static DEFINE_MUTEX(fscrypt_mode_key_setup_mutex); static struct fscrypt_mode * select_encryption_mode(const union fscrypt_policy *policy, const struct inode *inode) { BUILD_BUG_ON(ARRAY_SIZE(fscrypt_modes) != FSCRYPT_MODE_MAX + 1); if (S_ISREG(inode->i_mode)) return &fscrypt_modes[fscrypt_policy_contents_mode(policy)]; if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) return &fscrypt_modes[fscrypt_policy_fnames_mode(policy)]; WARN_ONCE(1, "fscrypt: filesystem tried to load encryption info for inode %lu, which is not encryptable (file type %d)\n", inode->i_ino, (inode->i_mode & S_IFMT)); return ERR_PTR(-EINVAL); } /* Create a symmetric cipher object for the given encryption mode and key */ static struct crypto_skcipher * fscrypt_allocate_skcipher(struct fscrypt_mode *mode, const u8 *raw_key, const struct inode *inode) { struct crypto_skcipher *tfm; int err; tfm = crypto_alloc_skcipher(mode->cipher_str, 0, 0); if (IS_ERR(tfm)) { if (PTR_ERR(tfm) == -ENOENT) { fscrypt_warn(inode, "Missing crypto API support for %s (API name: \"%s\")", mode->friendly_name, mode->cipher_str); return ERR_PTR(-ENOPKG); } fscrypt_err(inode, "Error allocating '%s' transform: %ld", mode->cipher_str, PTR_ERR(tfm)); return tfm; } if (!xchg(&mode->logged_cryptoapi_impl, 1)) { /* * fscrypt performance can vary greatly depending on which * crypto algorithm implementation is used. Help people debug * performance problems by logging the ->cra_driver_name the * first time a mode is used. */ pr_info("fscrypt: %s using implementation \"%s\"\n", mode->friendly_name, crypto_skcipher_driver_name(tfm)); } if (WARN_ON_ONCE(crypto_skcipher_ivsize(tfm) != mode->ivsize)) { err = -EINVAL; goto err_free_tfm; } crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS); err = crypto_skcipher_setkey(tfm, raw_key, mode->keysize); if (err) goto err_free_tfm; return tfm; err_free_tfm: crypto_free_skcipher(tfm); return ERR_PTR(err); } /* * Prepare the crypto transform object or blk-crypto key in @prep_key, given the * raw key, encryption mode (@ci->ci_mode), flag indicating which encryption * implementation (fs-layer or blk-crypto) will be used (@ci->ci_inlinecrypt), * and IV generation method (@ci->ci_policy.flags). */ int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key, const u8 *raw_key, const struct fscrypt_inode_info *ci) { struct crypto_skcipher *tfm; if (fscrypt_using_inline_encryption(ci)) return fscrypt_prepare_inline_crypt_key(prep_key, raw_key, ci); tfm = fscrypt_allocate_skcipher(ci->ci_mode, raw_key, ci->ci_inode); if (IS_ERR(tfm)) return PTR_ERR(tfm); /* * Pairs with the smp_load_acquire() in fscrypt_is_key_prepared(). * I.e., here we publish ->tfm with a RELEASE barrier so that * concurrent tasks can ACQUIRE it. Note that this concurrency is only * possible for per-mode keys, not for per-file keys. */ smp_store_release(&prep_key->tfm, tfm); return 0; } /* Destroy a crypto transform object and/or blk-crypto key. */ void fscrypt_destroy_prepared_key(struct super_block *sb, struct fscrypt_prepared_key *prep_key) { crypto_free_skcipher(prep_key->tfm); fscrypt_destroy_inline_crypt_key(sb, prep_key); memzero_explicit(prep_key, sizeof(*prep_key)); } /* Given a per-file encryption key, set up the file's crypto transform object */ int fscrypt_set_per_file_enc_key(struct fscrypt_inode_info *ci, const u8 *raw_key) { ci->ci_owns_key = true; return fscrypt_prepare_key(&ci->ci_enc_key, raw_key, ci); } static int setup_per_mode_enc_key(struct fscrypt_inode_info *ci, struct fscrypt_master_key *mk, struct fscrypt_prepared_key *keys, u8 hkdf_context, bool include_fs_uuid) { const struct inode *inode = ci->ci_inode; const struct super_block *sb = inode->i_sb; struct fscrypt_mode *mode = ci->ci_mode; const u8 mode_num = mode - fscrypt_modes; struct fscrypt_prepared_key *prep_key; u8 mode_key[FSCRYPT_MAX_KEY_SIZE]; u8 hkdf_info[sizeof(mode_num) + sizeof(sb->s_uuid)]; unsigned int hkdf_infolen = 0; int err; if (WARN_ON_ONCE(mode_num > FSCRYPT_MODE_MAX)) return -EINVAL; prep_key = &keys[mode_num]; if (fscrypt_is_key_prepared(prep_key, ci)) { ci->ci_enc_key = *prep_key; return 0; } mutex_lock(&fscrypt_mode_key_setup_mutex); if (fscrypt_is_key_prepared(prep_key, ci)) goto done_unlock; BUILD_BUG_ON(sizeof(mode_num) != 1); BUILD_BUG_ON(sizeof(sb->s_uuid) != 16); BUILD_BUG_ON(sizeof(hkdf_info) != 17); hkdf_info[hkdf_infolen++] = mode_num; if (include_fs_uuid) { memcpy(&hkdf_info[hkdf_infolen], &sb->s_uuid, sizeof(sb->s_uuid)); hkdf_infolen += sizeof(sb->s_uuid); } err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf, hkdf_context, hkdf_info, hkdf_infolen, mode_key, mode->keysize); if (err) goto out_unlock; err = fscrypt_prepare_key(prep_key, mode_key, ci); memzero_explicit(mode_key, mode->keysize); if (err) goto out_unlock; done_unlock: ci->ci_enc_key = *prep_key; err = 0; out_unlock: mutex_unlock(&fscrypt_mode_key_setup_mutex); return err; } /* * Derive a SipHash key from the given fscrypt master key and the given * application-specific information string. * * Note that the KDF produces a byte array, but the SipHash APIs expect the key * as a pair of 64-bit words. Therefore, on big endian CPUs we have to do an * endianness swap in order to get the same results as on little endian CPUs. */ static int fscrypt_derive_siphash_key(const struct fscrypt_master_key *mk, u8 context, const u8 *info, unsigned int infolen, siphash_key_t *key) { int err; err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf, context, info, infolen, (u8 *)key, sizeof(*key)); if (err) return err; BUILD_BUG_ON(sizeof(*key) != 16); BUILD_BUG_ON(ARRAY_SIZE(key->key) != 2); le64_to_cpus(&key->key[0]); le64_to_cpus(&key->key[1]); return 0; } int fscrypt_derive_dirhash_key(struct fscrypt_inode_info *ci, const struct fscrypt_master_key *mk) { int err; err = fscrypt_derive_siphash_key(mk, HKDF_CONTEXT_DIRHASH_KEY, ci->ci_nonce, FSCRYPT_FILE_NONCE_SIZE, &ci->ci_dirhash_key); if (err) return err; ci->ci_dirhash_key_initialized = true; return 0; } void fscrypt_hash_inode_number(struct fscrypt_inode_info *ci, const struct fscrypt_master_key *mk) { WARN_ON_ONCE(ci->ci_inode->i_ino == 0); WARN_ON_ONCE(!mk->mk_ino_hash_key_initialized); ci->ci_hashed_ino = (u32)siphash_1u64(ci->ci_inode->i_ino, &mk->mk_ino_hash_key); } static int fscrypt_setup_iv_ino_lblk_32_key(struct fscrypt_inode_info *ci, struct fscrypt_master_key *mk) { int err; err = setup_per_mode_enc_key(ci, mk, mk->mk_iv_ino_lblk_32_keys, HKDF_CONTEXT_IV_INO_LBLK_32_KEY, true); if (err) return err; /* pairs with smp_store_release() below */ if (!smp_load_acquire(&mk->mk_ino_hash_key_initialized)) { mutex_lock(&fscrypt_mode_key_setup_mutex); if (mk->mk_ino_hash_key_initialized) goto unlock; err = fscrypt_derive_siphash_key(mk, HKDF_CONTEXT_INODE_HASH_KEY, NULL, 0, &mk->mk_ino_hash_key); if (err) goto unlock; /* pairs with smp_load_acquire() above */ smp_store_release(&mk->mk_ino_hash_key_initialized, true); unlock: mutex_unlock(&fscrypt_mode_key_setup_mutex); if (err) return err; } /* * New inodes may not have an inode number assigned yet. * Hashing their inode number is delayed until later. */ if (ci->ci_inode->i_ino) fscrypt_hash_inode_number(ci, mk); return 0; } static int fscrypt_setup_v2_file_key(struct fscrypt_inode_info *ci, struct fscrypt_master_key *mk, bool need_dirhash_key) { int err; if (ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) { /* * DIRECT_KEY: instead of deriving per-file encryption keys, the * per-file nonce will be included in all the IVs. But unlike * v1 policies, for v2 policies in this case we don't encrypt * with the master key directly but rather derive a per-mode * encryption key. This ensures that the master key is * consistently used only for HKDF, avoiding key reuse issues. */ err = setup_per_mode_enc_key(ci, mk, mk->mk_direct_keys, HKDF_CONTEXT_DIRECT_KEY, false); } else if (ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64) { /* * IV_INO_LBLK_64: encryption keys are derived from (master_key, * mode_num, filesystem_uuid), and inode number is included in * the IVs. This format is optimized for use with inline * encryption hardware compliant with the UFS standard. */ err = setup_per_mode_enc_key(ci, mk, mk->mk_iv_ino_lblk_64_keys, HKDF_CONTEXT_IV_INO_LBLK_64_KEY, true); } else if (ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32) { err = fscrypt_setup_iv_ino_lblk_32_key(ci, mk); } else { u8 derived_key[FSCRYPT_MAX_KEY_SIZE]; err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf, HKDF_CONTEXT_PER_FILE_ENC_KEY, ci->ci_nonce, FSCRYPT_FILE_NONCE_SIZE, derived_key, ci->ci_mode->keysize); if (err) return err; err = fscrypt_set_per_file_enc_key(ci, derived_key); memzero_explicit(derived_key, ci->ci_mode->keysize); } if (err) return err; /* Derive a secret dirhash key for directories that need it. */ if (need_dirhash_key) { err = fscrypt_derive_dirhash_key(ci, mk); if (err) return err; } return 0; } /* * Check whether the size of the given master key (@mk) is appropriate for the * encryption settings which a particular file will use (@ci). * * If the file uses a v1 encryption policy, then the master key must be at least * as long as the derived key, as this is a requirement of the v1 KDF. * * Otherwise, the KDF can accept any size key, so we enforce a slightly looser * requirement: we require that the size of the master key be at least the * maximum security strength of any algorithm whose key will be derived from it * (but in practice we only need to consider @ci->ci_mode, since any other * possible subkeys such as DIRHASH and INODE_HASH will never increase the * required key size over @ci->ci_mode). This allows AES-256-XTS keys to be * derived from a 256-bit master key, which is cryptographically sufficient, * rather than requiring a 512-bit master key which is unnecessarily long. (We * still allow 512-bit master keys if the user chooses to use them, though.) */ static bool fscrypt_valid_master_key_size(const struct fscrypt_master_key *mk, const struct fscrypt_inode_info *ci) { unsigned int min_keysize; if (ci->ci_policy.version == FSCRYPT_POLICY_V1) min_keysize = ci->ci_mode->keysize; else min_keysize = ci->ci_mode->security_strength; if (mk->mk_secret.size < min_keysize) { fscrypt_warn(NULL, "key with %s %*phN is too short (got %u bytes, need %u+ bytes)", master_key_spec_type(&mk->mk_spec), master_key_spec_len(&mk->mk_spec), (u8 *)&mk->mk_spec.u, mk->mk_secret.size, min_keysize); return false; } return true; } /* * Find the master key, then set up the inode's actual encryption key. * * If the master key is found in the filesystem-level keyring, then it is * returned in *mk_ret with its semaphore read-locked. This is needed to ensure * that only one task links the fscrypt_inode_info into ->mk_decrypted_inodes * (as multiple tasks may race to create an fscrypt_inode_info for the same * inode), and to synchronize the master key being removed with a new inode * starting to use it. */ static int setup_file_encryption_key(struct fscrypt_inode_info *ci, bool need_dirhash_key, struct fscrypt_master_key **mk_ret) { struct super_block *sb = ci->ci_inode->i_sb; struct fscrypt_key_specifier mk_spec; struct fscrypt_master_key *mk; int err; err = fscrypt_select_encryption_impl(ci); if (err) return err; err = fscrypt_policy_to_key_spec(&ci->ci_policy, &mk_spec); if (err) return err; mk = fscrypt_find_master_key(sb, &mk_spec); if (unlikely(!mk)) { const union fscrypt_policy *dummy_policy = fscrypt_get_dummy_policy(sb); /* * Add the test_dummy_encryption key on-demand. In principle, * it should be added at mount time. Do it here instead so that * the individual filesystems don't need to worry about adding * this key at mount time and cleaning up on mount failure. */ if (dummy_policy && fscrypt_policies_equal(dummy_policy, &ci->ci_policy)) { err = fscrypt_add_test_dummy_key(sb, &mk_spec); if (err) return err; mk = fscrypt_find_master_key(sb, &mk_spec); } } if (unlikely(!mk)) { if (ci->ci_policy.version != FSCRYPT_POLICY_V1) return -ENOKEY; /* * As a legacy fallback for v1 policies, search for the key in * the current task's subscribed keyrings too. Don't move this * to before the search of ->s_master_keys, since users * shouldn't be able to override filesystem-level keys. */ return fscrypt_setup_v1_file_key_via_subscribed_keyrings(ci); } down_read(&mk->mk_sem); if (!mk->mk_present) { /* FS_IOC_REMOVE_ENCRYPTION_KEY has been executed on this key */ err = -ENOKEY; goto out_release_key; } if (!fscrypt_valid_master_key_size(mk, ci)) { err = -ENOKEY; goto out_release_key; } switch (ci->ci_policy.version) { case FSCRYPT_POLICY_V1: err = fscrypt_setup_v1_file_key(ci, mk->mk_secret.raw); break; case FSCRYPT_POLICY_V2: err = fscrypt_setup_v2_file_key(ci, mk, need_dirhash_key); break; default: WARN_ON_ONCE(1); err = -EINVAL; break; } if (err) goto out_release_key; *mk_ret = mk; return 0; out_release_key: up_read(&mk->mk_sem); fscrypt_put_master_key(mk); return err; } static void put_crypt_info(struct fscrypt_inode_info *ci) { struct fscrypt_master_key *mk; if (!ci) return; if (ci->ci_direct_key) fscrypt_put_direct_key(ci->ci_direct_key); else if (ci->ci_owns_key) fscrypt_destroy_prepared_key(ci->ci_inode->i_sb, &ci->ci_enc_key); mk = ci->ci_master_key; if (mk) { /* * Remove this inode from the list of inodes that were unlocked * with the master key. In addition, if we're removing the last * inode from an incompletely removed key, then complete the * full removal of the key. */ spin_lock(&mk->mk_decrypted_inodes_lock); list_del(&ci->ci_master_key_link); spin_unlock(&mk->mk_decrypted_inodes_lock); fscrypt_put_master_key_activeref(ci->ci_inode->i_sb, mk); } memzero_explicit(ci, sizeof(*ci)); kmem_cache_free(fscrypt_inode_info_cachep, ci); } static int fscrypt_setup_encryption_info(struct inode *inode, const union fscrypt_policy *policy, const u8 nonce[FSCRYPT_FILE_NONCE_SIZE], bool need_dirhash_key) { struct fscrypt_inode_info *crypt_info; struct fscrypt_mode *mode; struct fscrypt_master_key *mk = NULL; int res; res = fscrypt_initialize(inode->i_sb); if (res) return res; crypt_info = kmem_cache_zalloc(fscrypt_inode_info_cachep, GFP_KERNEL); if (!crypt_info) return -ENOMEM; crypt_info->ci_inode = inode; crypt_info->ci_policy = *policy; memcpy(crypt_info->ci_nonce, nonce, FSCRYPT_FILE_NONCE_SIZE); mode = select_encryption_mode(&crypt_info->ci_policy, inode); if (IS_ERR(mode)) { res = PTR_ERR(mode); goto out; } WARN_ON_ONCE(mode->ivsize > FSCRYPT_MAX_IV_SIZE); crypt_info->ci_mode = mode; crypt_info->ci_data_unit_bits = fscrypt_policy_du_bits(&crypt_info->ci_policy, inode); crypt_info->ci_data_units_per_block_bits = inode->i_blkbits - crypt_info->ci_data_unit_bits; res = setup_file_encryption_key(crypt_info, need_dirhash_key, &mk); if (res) goto out; /* * For existing inodes, multiple tasks may race to set ->i_crypt_info. * So use cmpxchg_release(). This pairs with the smp_load_acquire() in * fscrypt_get_inode_info(). I.e., here we publish ->i_crypt_info with * a RELEASE barrier so that other tasks can ACQUIRE it. */ if (cmpxchg_release(&inode->i_crypt_info, NULL, crypt_info) == NULL) { /* * We won the race and set ->i_crypt_info to our crypt_info. * Now link it into the master key's inode list. */ if (mk) { crypt_info->ci_master_key = mk; refcount_inc(&mk->mk_active_refs); spin_lock(&mk->mk_decrypted_inodes_lock); list_add(&crypt_info->ci_master_key_link, &mk->mk_decrypted_inodes); spin_unlock(&mk->mk_decrypted_inodes_lock); } crypt_info = NULL; } res = 0; out: if (mk) { up_read(&mk->mk_sem); fscrypt_put_master_key(mk); } put_crypt_info(crypt_info); return res; } /** * fscrypt_get_encryption_info() - set up an inode's encryption key * @inode: the inode to set up the key for. Must be encrypted. * @allow_unsupported: if %true, treat an unsupported encryption policy (or * unrecognized encryption context) the same way as the key * being unavailable, instead of returning an error. Use * %false unless the operation being performed is needed in * order for files (or directories) to be deleted. * * Set up ->i_crypt_info, if it hasn't already been done. * * Note: unless ->i_crypt_info is already set, this isn't %GFP_NOFS-safe. So * generally this shouldn't be called from within a filesystem transaction. * * Return: 0 if ->i_crypt_info was set or was already set, *or* if the * encryption key is unavailable. (Use fscrypt_has_encryption_key() to * distinguish these cases.) Also can return another -errno code. */ int fscrypt_get_encryption_info(struct inode *inode, bool allow_unsupported) { int res; union fscrypt_context ctx; union fscrypt_policy policy; if (fscrypt_has_encryption_key(inode)) return 0; res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); if (res < 0) { if (res == -ERANGE && allow_unsupported) return 0; fscrypt_warn(inode, "Error %d getting encryption context", res); return res; } res = fscrypt_policy_from_context(&policy, &ctx, res); if (res) { if (allow_unsupported) return 0; fscrypt_warn(inode, "Unrecognized or corrupt encryption context"); return res; } if (!fscrypt_supported_policy(&policy, inode)) { if (allow_unsupported) return 0; return -EINVAL; } res = fscrypt_setup_encryption_info(inode, &policy, fscrypt_context_nonce(&ctx), IS_CASEFOLDED(inode) && S_ISDIR(inode->i_mode)); if (res == -ENOPKG && allow_unsupported) /* Algorithm unavailable? */ res = 0; if (res == -ENOKEY) res = 0; return res; } /** * fscrypt_prepare_new_inode() - prepare to create a new inode in a directory * @dir: a possibly-encrypted directory * @inode: the new inode. ->i_mode must be set already. * ->i_ino doesn't need to be set yet. * @encrypt_ret: (output) set to %true if the new inode will be encrypted * * If the directory is encrypted, set up its ->i_crypt_info in preparation for * encrypting the name of the new file. Also, if the new inode will be * encrypted, set up its ->i_crypt_info and set *encrypt_ret=true. * * This isn't %GFP_NOFS-safe, and therefore it should be called before starting * any filesystem transaction to create the inode. For this reason, ->i_ino * isn't required to be set yet, as the filesystem may not have set it yet. * * This doesn't persist the new inode's encryption context. That still needs to * be done later by calling fscrypt_set_context(). * * Return: 0 on success, -ENOKEY if the encryption key is missing, or another * -errno code */ int fscrypt_prepare_new_inode(struct inode *dir, struct inode *inode, bool *encrypt_ret) { const union fscrypt_policy *policy; u8 nonce[FSCRYPT_FILE_NONCE_SIZE]; policy = fscrypt_policy_to_inherit(dir); if (policy == NULL) return 0; if (IS_ERR(policy)) return PTR_ERR(policy); if (WARN_ON_ONCE(inode->i_mode == 0)) return -EINVAL; /* * Only regular files, directories, and symlinks are encrypted. * Special files like device nodes and named pipes aren't. */ if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode) && !S_ISLNK(inode->i_mode)) return 0; *encrypt_ret = true; get_random_bytes(nonce, FSCRYPT_FILE_NONCE_SIZE); return fscrypt_setup_encryption_info(inode, policy, nonce, IS_CASEFOLDED(dir) && S_ISDIR(inode->i_mode)); } EXPORT_SYMBOL_GPL(fscrypt_prepare_new_inode); /** * fscrypt_put_encryption_info() - free most of an inode's fscrypt data * @inode: an inode being evicted * * Free the inode's fscrypt_inode_info. Filesystems must call this when the * inode is being evicted. An RCU grace period need not have elapsed yet. */ void fscrypt_put_encryption_info(struct inode *inode) { put_crypt_info(inode->i_crypt_info); inode->i_crypt_info = NULL; } EXPORT_SYMBOL(fscrypt_put_encryption_info); /** * fscrypt_free_inode() - free an inode's fscrypt data requiring RCU delay * @inode: an inode being freed * * Free the inode's cached decrypted symlink target, if any. Filesystems must * call this after an RCU grace period, just before they free the inode. */ void fscrypt_free_inode(struct inode *inode) { if (IS_ENCRYPTED(inode) && S_ISLNK(inode->i_mode)) { kfree(inode->i_link); inode->i_link = NULL; } } EXPORT_SYMBOL(fscrypt_free_inode); /** * fscrypt_drop_inode() - check whether the inode's master key has been removed * @inode: an inode being considered for eviction * * Filesystems supporting fscrypt must call this from their ->drop_inode() * method so that encrypted inodes are evicted as soon as they're no longer in * use and their master key has been removed. * * Return: 1 if fscrypt wants the inode to be evicted now, otherwise 0 */ int fscrypt_drop_inode(struct inode *inode) { const struct fscrypt_inode_info *ci = fscrypt_get_inode_info(inode); /* * If ci is NULL, then the inode doesn't have an encryption key set up * so it's irrelevant. If ci_master_key is NULL, then the master key * was provided via the legacy mechanism of the process-subscribed * keyrings, so we don't know whether it's been removed or not. */ if (!ci || !ci->ci_master_key) return 0; /* * With proper, non-racy use of FS_IOC_REMOVE_ENCRYPTION_KEY, all inodes * protected by the key were cleaned by sync_filesystem(). But if * userspace is still using the files, inodes can be dirtied between * then and now. We mustn't lose any writes, so skip dirty inodes here. */ if (inode->i_state & I_DIRTY_ALL) return 0; /* * We can't take ->mk_sem here, since this runs in atomic context. * Therefore, ->mk_present can change concurrently, and our result may * immediately become outdated. But there's no correctness problem with * unnecessarily evicting. Nor is there a correctness problem with not * evicting while iput() is racing with the key being removed, since * then the thread removing the key will either evict the inode itself * or will correctly detect that it wasn't evicted due to the race. */ return !READ_ONCE(ci->ci_master_key->mk_present); } EXPORT_SYMBOL_GPL(fscrypt_drop_inode);
21 21 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 // SPDX-License-Identifier: GPL-2.0 #include <linux/capability.h> #include <linux/fs.h> #include <linux/posix_acl.h> #include "reiserfs.h" #include <linux/errno.h> #include <linux/pagemap.h> #include <linux/xattr.h> #include <linux/slab.h> #include <linux/posix_acl_xattr.h> #include "xattr.h" #include "acl.h" #include <linux/uaccess.h> static int __reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode, int type, struct posix_acl *acl); int reiserfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { int error, error2; struct reiserfs_transaction_handle th; size_t jcreate_blocks; int size = acl ? posix_acl_xattr_size(acl->a_count) : 0; int update_mode = 0; struct inode *inode = d_inode(dentry); umode_t mode = inode->i_mode; /* * Pessimism: We can't assume that anything from the xattr root up * has been created. */ jcreate_blocks = reiserfs_xattr_jcreate_nblocks(inode) + reiserfs_xattr_nblocks(inode, size) * 2; reiserfs_write_lock(inode->i_sb); error = journal_begin(&th, inode->i_sb, jcreate_blocks); reiserfs_write_unlock(inode->i_sb); if (error == 0) { if (type == ACL_TYPE_ACCESS && acl) { error = posix_acl_update_mode(&nop_mnt_idmap, inode, &mode, &acl); if (error) goto unlock; update_mode = 1; } error = __reiserfs_set_acl(&th, inode, type, acl); if (!error && update_mode) inode->i_mode = mode; unlock: reiserfs_write_lock(inode->i_sb); error2 = journal_end(&th); reiserfs_write_unlock(inode->i_sb); if (error2) error = error2; } return error; } /* * Convert from filesystem to in-memory representation. */ static struct posix_acl *reiserfs_posix_acl_from_disk(const void *value, size_t size) { const char *end = (char *)value + size; int n, count; struct posix_acl *acl; if (!value) return NULL; if (size < sizeof(reiserfs_acl_header)) return ERR_PTR(-EINVAL); if (((reiserfs_acl_header *) value)->a_version != cpu_to_le32(REISERFS_ACL_VERSION)) return ERR_PTR(-EINVAL); value = (char *)value + sizeof(reiserfs_acl_header); count = reiserfs_acl_count(size); if (count < 0) return ERR_PTR(-EINVAL); if (count == 0) return NULL; acl = posix_acl_alloc(count, GFP_NOFS); if (!acl) return ERR_PTR(-ENOMEM); for (n = 0; n < count; n++) { reiserfs_acl_entry *entry = (reiserfs_acl_entry *) value; if ((char *)value + sizeof(reiserfs_acl_entry_short) > end) goto fail; acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag); acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm); switch (acl->a_entries[n].e_tag) { case ACL_USER_OBJ: case ACL_GROUP_OBJ: case ACL_MASK: case ACL_OTHER: value = (char *)value + sizeof(reiserfs_acl_entry_short); break; case ACL_USER: value = (char *)value + sizeof(reiserfs_acl_entry); if ((char *)value > end) goto fail; acl->a_entries[n].e_uid = make_kuid(&init_user_ns, le32_to_cpu(entry->e_id)); break; case ACL_GROUP: value = (char *)value + sizeof(reiserfs_acl_entry); if ((char *)value > end) goto fail; acl->a_entries[n].e_gid = make_kgid(&init_user_ns, le32_to_cpu(entry->e_id)); break; default: goto fail; } } if (value != end) goto fail; return acl; fail: posix_acl_release(acl); return ERR_PTR(-EINVAL); } /* * Convert from in-memory to filesystem representation. */ static void *reiserfs_posix_acl_to_disk(const struct posix_acl *acl, size_t * size) { reiserfs_acl_header *ext_acl; char *e; int n; *size = reiserfs_acl_size(acl->a_count); ext_acl = kmalloc(sizeof(reiserfs_acl_header) + acl->a_count * sizeof(reiserfs_acl_entry), GFP_NOFS); if (!ext_acl) return ERR_PTR(-ENOMEM); ext_acl->a_version = cpu_to_le32(REISERFS_ACL_VERSION); e = (char *)ext_acl + sizeof(reiserfs_acl_header); for (n = 0; n < acl->a_count; n++) { const struct posix_acl_entry *acl_e = &acl->a_entries[n]; reiserfs_acl_entry *entry = (reiserfs_acl_entry *) e; entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag); entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm); switch (acl->a_entries[n].e_tag) { case ACL_USER: entry->e_id = cpu_to_le32( from_kuid(&init_user_ns, acl_e->e_uid)); e += sizeof(reiserfs_acl_entry); break; case ACL_GROUP: entry->e_id = cpu_to_le32( from_kgid(&init_user_ns, acl_e->e_gid)); e += sizeof(reiserfs_acl_entry); break; case ACL_USER_OBJ: case ACL_GROUP_OBJ: case ACL_MASK: case ACL_OTHER: e += sizeof(reiserfs_acl_entry_short); break; default: goto fail; } } return (char *)ext_acl; fail: kfree(ext_acl); return ERR_PTR(-EINVAL); } /* * Inode operation get_posix_acl(). * * inode->i_mutex: down * BKL held [before 2.5.x] */ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type, bool rcu) { char *name, *value; struct posix_acl *acl; int size; int retval; if (rcu) return ERR_PTR(-ECHILD); switch (type) { case ACL_TYPE_ACCESS: name = XATTR_NAME_POSIX_ACL_ACCESS; break; case ACL_TYPE_DEFAULT: name = XATTR_NAME_POSIX_ACL_DEFAULT; break; default: BUG(); } size = reiserfs_xattr_get(inode, name, NULL, 0); if (size < 0) { if (size == -ENODATA || size == -ENOSYS) return NULL; return ERR_PTR(size); } value = kmalloc(size, GFP_NOFS); if (!value) return ERR_PTR(-ENOMEM); retval = reiserfs_xattr_get(inode, name, value, size); if (retval == -ENODATA || retval == -ENOSYS) { /* * This shouldn't actually happen as it should have * been caught above.. but just in case */ acl = NULL; } else if (retval < 0) { acl = ERR_PTR(retval); } else { acl = reiserfs_posix_acl_from_disk(value, retval); } kfree(value); return acl; } /* * Inode operation set_posix_acl(). * * inode->i_mutex: down * BKL held [before 2.5.x] */ static int __reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode, int type, struct posix_acl *acl) { char *name; void *value = NULL; size_t size = 0; int error; switch (type) { case ACL_TYPE_ACCESS: name = XATTR_NAME_POSIX_ACL_ACCESS; break; case ACL_TYPE_DEFAULT: name = XATTR_NAME_POSIX_ACL_DEFAULT; if (!S_ISDIR(inode->i_mode)) return acl ? -EACCES : 0; break; default: return -EINVAL; } if (acl) { value = reiserfs_posix_acl_to_disk(acl, &size); if (IS_ERR(value)) return (int)PTR_ERR(value); } error = reiserfs_xattr_set_handle(th, inode, name, value, size, 0); /* * Ensure that the inode gets dirtied if we're only using * the mode bits and an old ACL didn't exist. We don't need * to check if the inode is hashed here since we won't get * called by reiserfs_inherit_default_acl(). */ if (error == -ENODATA) { error = 0; if (type == ACL_TYPE_ACCESS) { inode_set_ctime_current(inode); mark_inode_dirty(inode); } } kfree(value); if (!error) set_cached_acl(inode, type, acl); return error; } /* * dir->i_mutex: locked, * inode is new and not released into the wild yet */ int reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th, struct inode *dir, struct dentry *dentry, struct inode *inode) { struct posix_acl *default_acl, *acl; int err = 0; /* ACLs only get applied to files and directories */ if (S_ISLNK(inode->i_mode)) return 0; /* * ACLs can only be used on "new" objects, so if it's an old object * there is nothing to inherit from */ if (get_inode_sd_version(dir) == STAT_DATA_V1) goto apply_umask; /* * Don't apply ACLs to objects in the .reiserfs_priv tree.. This * would be useless since permissions are ignored, and a pain because * it introduces locking cycles */ if (IS_PRIVATE(inode)) goto apply_umask; err = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); if (err) return err; if (default_acl) { err = __reiserfs_set_acl(th, inode, ACL_TYPE_DEFAULT, default_acl); posix_acl_release(default_acl); } if (acl) { if (!err) err = __reiserfs_set_acl(th, inode, ACL_TYPE_ACCESS, acl); posix_acl_release(acl); } return err; apply_umask: /* no ACL, apply umask */ inode->i_mode &= ~current_umask(); return err; } /* This is used to cache the default acl before a new object is created. * The biggest reason for this is to get an idea of how many blocks will * actually be required for the create operation if we must inherit an ACL. * An ACL write can add up to 3 object creations and an additional file write * so we'd prefer not to reserve that many blocks in the journal if we can. * It also has the advantage of not loading the ACL with a transaction open, * this may seem silly, but if the owner of the directory is doing the * creation, the ACL may not be loaded since the permissions wouldn't require * it. * We return the number of blocks required for the transaction. */ int reiserfs_cache_default_acl(struct inode *inode) { struct posix_acl *acl; int nblocks = 0; if (IS_PRIVATE(inode)) return 0; acl = get_inode_acl(inode, ACL_TYPE_DEFAULT); if (acl && !IS_ERR(acl)) { int size = reiserfs_acl_size(acl->a_count); /* Other xattrs can be created during inode creation. We don't * want to claim too many blocks, so we check to see if we * need to create the tree to the xattrs, and then we * just want two files. */ nblocks = reiserfs_xattr_jcreate_nblocks(inode); nblocks += JOURNAL_BLOCKS_PER_OBJECT(inode->i_sb); REISERFS_I(inode)->i_flags |= i_has_xattr_dir; /* We need to account for writes + bitmaps for two files */ nblocks += reiserfs_xattr_nblocks(inode, size) * 4; posix_acl_release(acl); } return nblocks; } /* * Called under i_mutex */ int reiserfs_acl_chmod(struct dentry *dentry) { struct inode *inode = d_inode(dentry); if (IS_PRIVATE(inode)) return 0; if (get_inode_sd_version(inode) == STAT_DATA_V1 || !reiserfs_posixacl(inode->i_sb)) return 0; return posix_acl_chmod(&nop_mnt_idmap, dentry, inode->i_mode); }
952 952 954 97 952 874 1 944 5 79 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 // SPDX-License-Identifier: GPL-2.0-or-later /* * SHA-256, as specified in * http://csrc.nist.gov/groups/STM/cavp/documents/shs/sha256-384-512.pdf * * SHA-256 code by Jean-Luc Cooke <jlcooke@certainkey.com>. * * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com> * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk> * Copyright (c) 2002 James Morris <jmorris@intercode.com.au> * Copyright (c) 2014 Red Hat Inc. */ #include <asm/unaligned.h> #include <crypto/sha256_base.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/string.h> static const u32 SHA256_K[] = { 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, }; static inline u32 Ch(u32 x, u32 y, u32 z) { return z ^ (x & (y ^ z)); } static inline u32 Maj(u32 x, u32 y, u32 z) { return (x & y) | (z & (x | y)); } #define e0(x) (ror32(x, 2) ^ ror32(x, 13) ^ ror32(x, 22)) #define e1(x) (ror32(x, 6) ^ ror32(x, 11) ^ ror32(x, 25)) #define s0(x) (ror32(x, 7) ^ ror32(x, 18) ^ (x >> 3)) #define s1(x) (ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10)) static inline void LOAD_OP(int I, u32 *W, const u8 *input) { W[I] = get_unaligned_be32((__u32 *)input + I); } static inline void BLEND_OP(int I, u32 *W) { W[I] = s1(W[I-2]) + W[I-7] + s0(W[I-15]) + W[I-16]; } #define SHA256_ROUND(i, a, b, c, d, e, f, g, h) do { \ u32 t1, t2; \ t1 = h + e1(e) + Ch(e, f, g) + SHA256_K[i] + W[i]; \ t2 = e0(a) + Maj(a, b, c); \ d += t1; \ h = t1 + t2; \ } while (0) static void sha256_transform(u32 *state, const u8 *input, u32 *W) { u32 a, b, c, d, e, f, g, h; int i; /* load the input */ for (i = 0; i < 16; i += 8) { LOAD_OP(i + 0, W, input); LOAD_OP(i + 1, W, input); LOAD_OP(i + 2, W, input); LOAD_OP(i + 3, W, input); LOAD_OP(i + 4, W, input); LOAD_OP(i + 5, W, input); LOAD_OP(i + 6, W, input); LOAD_OP(i + 7, W, input); } /* now blend */ for (i = 16; i < 64; i += 8) { BLEND_OP(i + 0, W); BLEND_OP(i + 1, W); BLEND_OP(i + 2, W); BLEND_OP(i + 3, W); BLEND_OP(i + 4, W); BLEND_OP(i + 5, W); BLEND_OP(i + 6, W); BLEND_OP(i + 7, W); } /* load the state into our registers */ a = state[0]; b = state[1]; c = state[2]; d = state[3]; e = state[4]; f = state[5]; g = state[6]; h = state[7]; /* now iterate */ for (i = 0; i < 64; i += 8) { SHA256_ROUND(i + 0, a, b, c, d, e, f, g, h); SHA256_ROUND(i + 1, h, a, b, c, d, e, f, g); SHA256_ROUND(i + 2, g, h, a, b, c, d, e, f); SHA256_ROUND(i + 3, f, g, h, a, b, c, d, e); SHA256_ROUND(i + 4, e, f, g, h, a, b, c, d); SHA256_ROUND(i + 5, d, e, f, g, h, a, b, c); SHA256_ROUND(i + 6, c, d, e, f, g, h, a, b); SHA256_ROUND(i + 7, b, c, d, e, f, g, h, a); } state[0] += a; state[1] += b; state[2] += c; state[3] += d; state[4] += e; state[5] += f; state[6] += g; state[7] += h; } static void sha256_transform_blocks(struct sha256_state *sctx, const u8 *input, int blocks) { u32 W[64]; do { sha256_transform(sctx->state, input, W); input += SHA256_BLOCK_SIZE; } while (--blocks); memzero_explicit(W, sizeof(W)); } void sha256_update(struct sha256_state *sctx, const u8 *data, unsigned int len) { lib_sha256_base_do_update(sctx, data, len, sha256_transform_blocks); } EXPORT_SYMBOL(sha256_update); static void __sha256_final(struct sha256_state *sctx, u8 *out, int digest_size) { lib_sha256_base_do_finalize(sctx, sha256_transform_blocks); lib_sha256_base_finish(sctx, out, digest_size); } void sha256_final(struct sha256_state *sctx, u8 *out) { __sha256_final(sctx, out, 32); } EXPORT_SYMBOL(sha256_final); void sha224_final(struct sha256_state *sctx, u8 *out) { __sha256_final(sctx, out, 28); } EXPORT_SYMBOL(sha224_final); void sha256(const u8 *data, unsigned int len, u8 *out) { struct sha256_state sctx; sha256_init(&sctx); sha256_update(&sctx, data, len); sha256_final(&sctx, out); } EXPORT_SYMBOL(sha256); MODULE_LICENSE("GPL");
36 36 36 36 35 31 25 31 25 32 24 32 1 33 25 41 41 33 33 40 28 35 35 28 19 36 41 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 // SPDX-License-Identifier: GPL-2.0-or-later /* * Squashfs - a compressed read only filesystem for Linux * * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 * Phillip Lougher <phillip@squashfs.org.uk> * * block.c */ /* * This file implements the low-level routines to read and decompress * datablocks and metadata blocks. */ #include <linux/blkdev.h> #include <linux/fs.h> #include <linux/vfs.h> #include <linux/slab.h> #include <linux/pagemap.h> #include <linux/string.h> #include <linux/bio.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" #include "squashfs.h" #include "decompressor.h" #include "page_actor.h" /* * Returns the amount of bytes copied to the page actor. */ static int copy_bio_to_actor(struct bio *bio, struct squashfs_page_actor *actor, int offset, int req_length) { void *actor_addr; struct bvec_iter_all iter_all = {}; struct bio_vec *bvec = bvec_init_iter_all(&iter_all); int copied_bytes = 0; int actor_offset = 0; squashfs_actor_nobuff(actor); actor_addr = squashfs_first_page(actor); if (WARN_ON_ONCE(!bio_next_segment(bio, &iter_all))) return 0; while (copied_bytes < req_length) { int bytes_to_copy = min_t(int, bvec->bv_len - offset, PAGE_SIZE - actor_offset); bytes_to_copy = min_t(int, bytes_to_copy, req_length - copied_bytes); if (!IS_ERR(actor_addr)) memcpy(actor_addr + actor_offset, bvec_virt(bvec) + offset, bytes_to_copy); actor_offset += bytes_to_copy; copied_bytes += bytes_to_copy; offset += bytes_to_copy; if (actor_offset >= PAGE_SIZE) { actor_addr = squashfs_next_page(actor); if (!actor_addr) break; actor_offset = 0; } if (offset >= bvec->bv_len) { if (!bio_next_segment(bio, &iter_all)) break; offset = 0; } } squashfs_finish_page(actor); return copied_bytes; } static int squashfs_bio_read_cached(struct bio *fullbio, struct address_space *cache_mapping, u64 index, int length, u64 read_start, u64 read_end, int page_count) { struct page *head_to_cache = NULL, *tail_to_cache = NULL; struct block_device *bdev = fullbio->bi_bdev; int start_idx = 0, end_idx = 0; struct bvec_iter_all iter_all; struct bio *bio = NULL; struct bio_vec *bv; int idx = 0; int err = 0; bio_for_each_segment_all(bv, fullbio, iter_all) { struct page *page = bv->bv_page; if (page->mapping == cache_mapping) { idx++; continue; } /* * We only use this when the device block size is the same as * the page size, so read_start and read_end cover full pages. * * Compare these to the original required index and length to * only cache pages which were requested partially, since these * are the ones which are likely to be needed when reading * adjacent blocks. */ if (idx == 0 && index != read_start) head_to_cache = page; else if (idx == page_count - 1 && index + length != read_end) tail_to_cache = page; if (!bio || idx != end_idx) { struct bio *new = bio_alloc_clone(bdev, fullbio, GFP_NOIO, &fs_bio_set); if (bio) { bio_trim(bio, start_idx * PAGE_SECTORS, (end_idx - start_idx) * PAGE_SECTORS); bio_chain(bio, new); submit_bio(bio); } bio = new; start_idx = idx; } idx++; end_idx = idx; } if (bio) { bio_trim(bio, start_idx * PAGE_SECTORS, (end_idx - start_idx) * PAGE_SECTORS); err = submit_bio_wait(bio); bio_put(bio); } if (err) return err; if (head_to_cache) { int ret = add_to_page_cache_lru(head_to_cache, cache_mapping, read_start >> PAGE_SHIFT, GFP_NOIO); if (!ret) { SetPageUptodate(head_to_cache); unlock_page(head_to_cache); } } if (tail_to_cache) { int ret = add_to_page_cache_lru(tail_to_cache, cache_mapping, (read_end >> PAGE_SHIFT) - 1, GFP_NOIO); if (!ret) { SetPageUptodate(tail_to_cache); unlock_page(tail_to_cache); } } return 0; } static struct page *squashfs_get_cache_page(struct address_space *mapping, pgoff_t index) { struct page *page; if (!mapping) return NULL; page = find_get_page(mapping, index); if (!page) return NULL; if (!PageUptodate(page)) { put_page(page); return NULL; } return page; } static int squashfs_bio_read(struct super_block *sb, u64 index, int length, struct bio **biop, int *block_offset) { struct squashfs_sb_info *msblk = sb->s_fs_info; struct address_space *cache_mapping = msblk->cache_mapping; const u64 read_start = round_down(index, msblk->devblksize); const sector_t block = read_start >> msblk->devblksize_log2; const u64 read_end = round_up(index + length, msblk->devblksize); const sector_t block_end = read_end >> msblk->devblksize_log2; int offset = read_start - round_down(index, PAGE_SIZE); int total_len = (block_end - block) << msblk->devblksize_log2; const int page_count = DIV_ROUND_UP(total_len + offset, PAGE_SIZE); int error, i; struct bio *bio; bio = bio_kmalloc(page_count, GFP_NOIO); if (!bio) return -ENOMEM; bio_init(bio, sb->s_bdev, bio->bi_inline_vecs, page_count, REQ_OP_READ); bio->bi_iter.bi_sector = block * (msblk->devblksize >> SECTOR_SHIFT); for (i = 0; i < page_count; ++i) { unsigned int len = min_t(unsigned int, PAGE_SIZE - offset, total_len); pgoff_t index = (read_start >> PAGE_SHIFT) + i; struct page *page; page = squashfs_get_cache_page(cache_mapping, index); if (!page) page = alloc_page(GFP_NOIO); if (!page) { error = -ENOMEM; goto out_free_bio; } /* * Use the __ version to avoid merging since we need each page * to be separate when we check for and avoid cached pages. */ __bio_add_page(bio, page, len, offset); offset = 0; total_len -= len; } if (cache_mapping) error = squashfs_bio_read_cached(bio, cache_mapping, index, length, read_start, read_end, page_count); else error = submit_bio_wait(bio); if (error) goto out_free_bio; *biop = bio; *block_offset = index & ((1 << msblk->devblksize_log2) - 1); return 0; out_free_bio: bio_free_pages(bio); bio_uninit(bio); kfree(bio); return error; } /* * Read and decompress a metadata block or datablock. Length is non-zero * if a datablock is being read (the size is stored elsewhere in the * filesystem), otherwise the length is obtained from the first two bytes of * the metadata block. A bit in the length field indicates if the block * is stored uncompressed in the filesystem (usually because compression * generated a larger block - this does occasionally happen with compression * algorithms). */ int squashfs_read_data(struct super_block *sb, u64 index, int length, u64 *next_index, struct squashfs_page_actor *output) { struct squashfs_sb_info *msblk = sb->s_fs_info; struct bio *bio = NULL; int compressed; int res; int offset; if (length) { /* * Datablock. */ compressed = SQUASHFS_COMPRESSED_BLOCK(length); length = SQUASHFS_COMPRESSED_SIZE_BLOCK(length); TRACE("Block @ 0x%llx, %scompressed size %d, src size %d\n", index, compressed ? "" : "un", length, output->length); } else { /* * Metadata block. */ const u8 *data; struct bvec_iter_all iter_all = {}; struct bio_vec *bvec = bvec_init_iter_all(&iter_all); if (index + 2 > msblk->bytes_used) { res = -EIO; goto out; } res = squashfs_bio_read(sb, index, 2, &bio, &offset); if (res) goto out; if (WARN_ON_ONCE(!bio_next_segment(bio, &iter_all))) { res = -EIO; goto out_free_bio; } /* Extract the length of the metadata block */ data = bvec_virt(bvec); length = data[offset]; if (offset < bvec->bv_len - 1) { length |= data[offset + 1] << 8; } else { if (WARN_ON_ONCE(!bio_next_segment(bio, &iter_all))) { res = -EIO; goto out_free_bio; } data = bvec_virt(bvec); length |= data[0] << 8; } bio_free_pages(bio); bio_uninit(bio); kfree(bio); compressed = SQUASHFS_COMPRESSED(length); length = SQUASHFS_COMPRESSED_SIZE(length); index += 2; TRACE("Block @ 0x%llx, %scompressed size %d\n", index - 2, compressed ? "" : "un", length); } if (length <= 0 || length > output->length || (index + length) > msblk->bytes_used) { res = -EIO; goto out; } if (next_index) *next_index = index + length; res = squashfs_bio_read(sb, index, length, &bio, &offset); if (res) goto out; if (compressed) { if (!msblk->stream) { res = -EIO; goto out_free_bio; } res = msblk->thread_ops->decompress(msblk, bio, offset, length, output); } else { res = copy_bio_to_actor(bio, output, offset, length); } out_free_bio: bio_free_pages(bio); bio_uninit(bio); kfree(bio); out: if (res < 0) { ERROR("Failed to read block 0x%llx: %d\n", index, res); if (msblk->panic_on_errors) panic("squashfs read failed"); } return res; }
2 2 1 1 2 1 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2000-2001 Christoph Hellwig. * Copyright (c) 2016 Krzysztof Blaszkowski */ /* * Veritas filesystem driver - superblock related routines. */ #include <linux/init.h> #include <linux/module.h> #include <linux/blkdev.h> #include <linux/fs.h> #include <linux/buffer_head.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/stat.h> #include <linux/vfs.h> #include <linux/mount.h> #include "vxfs.h" #include "vxfs_extern.h" #include "vxfs_dir.h" #include "vxfs_inode.h" MODULE_AUTHOR("Christoph Hellwig, Krzysztof Blaszkowski"); MODULE_DESCRIPTION("Veritas Filesystem (VxFS) driver"); MODULE_LICENSE("Dual BSD/GPL"); static struct kmem_cache *vxfs_inode_cachep; /** * vxfs_put_super - free superblock resources * @sbp: VFS superblock. * * Description: * vxfs_put_super frees all resources allocated for @sbp * after the last instance of the filesystem is unmounted. */ static void vxfs_put_super(struct super_block *sbp) { struct vxfs_sb_info *infp = VXFS_SBI(sbp); iput(infp->vsi_fship); iput(infp->vsi_ilist); iput(infp->vsi_stilist); brelse(infp->vsi_bp); kfree(infp); } /** * vxfs_statfs - get filesystem information * @dentry: VFS dentry to locate superblock * @bufp: output buffer * * Description: * vxfs_statfs fills the statfs buffer @bufp with information * about the filesystem described by @dentry. * * Returns: * Zero. * * Locking: * No locks held. * * Notes: * This is everything but complete... */ static int vxfs_statfs(struct dentry *dentry, struct kstatfs *bufp) { struct vxfs_sb_info *infp = VXFS_SBI(dentry->d_sb); struct vxfs_sb *raw_sb = infp->vsi_raw; u64 id = huge_encode_dev(dentry->d_sb->s_bdev->bd_dev); bufp->f_type = VXFS_SUPER_MAGIC; bufp->f_bsize = dentry->d_sb->s_blocksize; bufp->f_blocks = fs32_to_cpu(infp, raw_sb->vs_dsize); bufp->f_bfree = fs32_to_cpu(infp, raw_sb->vs_free); bufp->f_bavail = 0; bufp->f_files = 0; bufp->f_ffree = fs32_to_cpu(infp, raw_sb->vs_ifree); bufp->f_fsid = u64_to_fsid(id); bufp->f_namelen = VXFS_NAMELEN; return 0; } static int vxfs_remount(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); *flags |= SB_RDONLY; return 0; } static struct inode *vxfs_alloc_inode(struct super_block *sb) { struct vxfs_inode_info *vi; vi = alloc_inode_sb(sb, vxfs_inode_cachep, GFP_KERNEL); if (!vi) return NULL; inode_init_once(&vi->vfs_inode); return &vi->vfs_inode; } static void vxfs_free_inode(struct inode *inode) { kmem_cache_free(vxfs_inode_cachep, VXFS_INO(inode)); } static const struct super_operations vxfs_super_ops = { .alloc_inode = vxfs_alloc_inode, .free_inode = vxfs_free_inode, .evict_inode = vxfs_evict_inode, .put_super = vxfs_put_super, .statfs = vxfs_statfs, .remount_fs = vxfs_remount, }; static int vxfs_try_sb_magic(struct super_block *sbp, int silent, unsigned blk, __fs32 magic) { struct buffer_head *bp; struct vxfs_sb *rsbp; struct vxfs_sb_info *infp = VXFS_SBI(sbp); int rc = -ENOMEM; bp = sb_bread(sbp, blk); do { if (!bp || !buffer_mapped(bp)) { if (!silent) { printk(KERN_WARNING "vxfs: unable to read disk superblock at %u\n", blk); } break; } rc = -EINVAL; rsbp = (struct vxfs_sb *)bp->b_data; if (rsbp->vs_magic != magic) { if (!silent) printk(KERN_NOTICE "vxfs: WRONG superblock magic %08x at %u\n", rsbp->vs_magic, blk); break; } rc = 0; infp->vsi_raw = rsbp; infp->vsi_bp = bp; } while (0); if (rc) { infp->vsi_raw = NULL; infp->vsi_bp = NULL; brelse(bp); } return rc; } /** * vxfs_fill_super - read superblock into memory and initialize filesystem * @sbp: VFS superblock (to fill) * @dp: fs private mount data * @silent: do not complain loudly when sth is wrong * * Description: * We are called on the first mount of a filesystem to read the * superblock into memory and do some basic setup. * * Returns: * The superblock on success, else %NULL. * * Locking: * We are under @sbp->s_lock. */ static int vxfs_fill_super(struct super_block *sbp, void *dp, int silent) { struct vxfs_sb_info *infp; struct vxfs_sb *rsbp; u_long bsize; struct inode *root; int ret = -EINVAL; u32 j; sbp->s_flags |= SB_RDONLY; infp = kzalloc(sizeof(*infp), GFP_KERNEL); if (!infp) { printk(KERN_WARNING "vxfs: unable to allocate incore superblock\n"); return -ENOMEM; } bsize = sb_min_blocksize(sbp, BLOCK_SIZE); if (!bsize) { printk(KERN_WARNING "vxfs: unable to set blocksize\n"); goto out; } sbp->s_op = &vxfs_super_ops; sbp->s_fs_info = infp; sbp->s_time_min = 0; sbp->s_time_max = U32_MAX; if (!vxfs_try_sb_magic(sbp, silent, 1, (__force __fs32)cpu_to_le32(VXFS_SUPER_MAGIC))) { /* Unixware, x86 */ infp->byte_order = VXFS_BO_LE; } else if (!vxfs_try_sb_magic(sbp, silent, 8, (__force __fs32)cpu_to_be32(VXFS_SUPER_MAGIC))) { /* HP-UX, parisc */ infp->byte_order = VXFS_BO_BE; } else { if (!silent) printk(KERN_NOTICE "vxfs: can't find superblock.\n"); goto out; } rsbp = infp->vsi_raw; j = fs32_to_cpu(infp, rsbp->vs_version); if ((j < 2 || j > 4) && !silent) { printk(KERN_NOTICE "vxfs: unsupported VxFS version (%d)\n", j); goto out; } #ifdef DIAGNOSTIC printk(KERN_DEBUG "vxfs: supported VxFS version (%d)\n", j); printk(KERN_DEBUG "vxfs: blocksize: %d\n", fs32_to_cpu(infp, rsbp->vs_bsize)); #endif sbp->s_magic = fs32_to_cpu(infp, rsbp->vs_magic); infp->vsi_oltext = fs32_to_cpu(infp, rsbp->vs_oltext[0]); infp->vsi_oltsize = fs32_to_cpu(infp, rsbp->vs_oltsize); j = fs32_to_cpu(infp, rsbp->vs_bsize); if (!sb_set_blocksize(sbp, j)) { printk(KERN_WARNING "vxfs: unable to set final block size\n"); goto out; } if (vxfs_read_olt(sbp, bsize)) { printk(KERN_WARNING "vxfs: unable to read olt\n"); goto out; } if (vxfs_read_fshead(sbp)) { printk(KERN_WARNING "vxfs: unable to read fshead\n"); goto out; } root = vxfs_iget(sbp, VXFS_ROOT_INO); if (IS_ERR(root)) { ret = PTR_ERR(root); goto out; } sbp->s_root = d_make_root(root); if (!sbp->s_root) { printk(KERN_WARNING "vxfs: unable to get root dentry.\n"); goto out_free_ilist; } return 0; out_free_ilist: iput(infp->vsi_fship); iput(infp->vsi_ilist); iput(infp->vsi_stilist); out: brelse(infp->vsi_bp); kfree(infp); return ret; } /* * The usual module blurb. */ static struct dentry *vxfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { return mount_bdev(fs_type, flags, dev_name, data, vxfs_fill_super); } static struct file_system_type vxfs_fs_type = { .owner = THIS_MODULE, .name = "vxfs", .mount = vxfs_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("vxfs"); /* makes mount -t vxfs autoload the module */ MODULE_ALIAS("vxfs"); static int __init vxfs_init(void) { int rv; vxfs_inode_cachep = kmem_cache_create_usercopy("vxfs_inode", sizeof(struct vxfs_inode_info), 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, offsetof(struct vxfs_inode_info, vii_immed.vi_immed), sizeof_field(struct vxfs_inode_info, vii_immed.vi_immed), NULL); if (!vxfs_inode_cachep) return -ENOMEM; rv = register_filesystem(&vxfs_fs_type); if (rv < 0) kmem_cache_destroy(vxfs_inode_cachep); return rv; } static void __exit vxfs_cleanup(void) { unregister_filesystem(&vxfs_fs_type); /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. */ rcu_barrier(); kmem_cache_destroy(vxfs_inode_cachep); } module_init(vxfs_init); module_exit(vxfs_cleanup);
84 49 61 24 9 24 9 9 7 3 8 1 7 20 20 48 2 2 1 20 34 21 3 47 51 60 60 65 63 27 6 13 2 6 13 60 65 65 35 35 58 58 64 60 13 64 28 28 12 1 1 1 1 1 1 7 37 27 13 20 10 56 14 19 54 36 33 10 10 3 3 3 27 56 57 27 56 26 3 64 49 45 3 49 1 15 1 1 15 15 15 15 15 6 6 6 6 19 31 31 25 13 12 25 28 26 3 3 46 46 32 46 46 46 46 46 8 1 1 1 52 1 23 1 40 1 33 10 8 1 8 23 15 7 7 14 20 6 19 39 4 2 9 2 15 16 62 42 13 52 49 26 13 47 58 58 17 17 17 29 30 30 2 2 2 2 2 14 2 2 56 25 6 20 30 28 26 16 27 20 3 2 28 9 20 13 52 8 14 2 8 8 4 2 2 3 2 3 2 3 3 2 33 33 33 18 13 12 2 11 13 13 33 36 8 17 29 36 10 1 29 5 15 15 7 8 2 2 8 1 2 6 6 4 39 39 4 2 3 4 13 30 11 6 41 15 33 41 8 6 45 33 20 63 7 34 1 3 9 11 1 10 3 1 10 10 1 12 3 1 48 27 38 3 5 45 27 47 10 2 10 6 4 2 5 10 2 10 73 50 22 10 25 7 31 21 5 2 6 25 1 24 34 10 48 35 35 68 68 67 22 1 22 57 11 11 8 6 2 4 4 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638 7639 7640 7641 7642 7643 7644 7645 7646 7647 7648 7649 7650 7651 7652 7653 7654 7655 7656 7657 7658 7659 7660 7661 7662 7663 7664 7665 7666 7667 7668 7669 7670 7671 7672 7673 7674 7675 7676 7677 7678 7679 7680 7681 7682 7683 7684 7685 7686 7687 7688 7689 7690 7691 7692 7693 7694 7695 7696 7697 7698 7699 7700 7701 7702 7703 7704 7705 7706 7707 7708 7709 7710 7711 7712 7713 7714 7715 7716 7717 7718 7719 7720 7721 7722 7723 7724 7725 7726 7727 7728 7729 7730 7731 7732 7733 7734 7735 7736 7737 7738 7739 7740 7741 7742 7743 7744 7745 7746 7747 7748 7749 7750 7751 7752 7753 7754 7755 7756 7757 7758 7759 7760 7761 7762 7763 7764 7765 7766 7767 7768 7769 7770 7771 7772 7773 7774 7775 7776 7777 7778 7779 7780 7781 7782 7783 7784 7785 7786 7787 7788 7789 7790 7791 7792 7793 7794 7795 7796 7797 7798 7799 7800 7801 // SPDX-License-Identifier: GPL-2.0-only /* * Generic hugetlb support. * (C) Nadia Yvette Chambers, April 2004 */ #include <linux/list.h> #include <linux/init.h> #include <linux/mm.h> #include <linux/seq_file.h> #include <linux/sysctl.h> #include <linux/highmem.h> #include <linux/mmu_notifier.h> #include <linux/nodemask.h> #include <linux/pagemap.h> #include <linux/mempolicy.h> #include <linux/compiler.h> #include <linux/cpuset.h> #include <linux/mutex.h> #include <linux/memblock.h> #include <linux/sysfs.h> #include <linux/slab.h> #include <linux/sched/mm.h> #include <linux/mmdebug.h> #include <linux/sched/signal.h> #include <linux/rmap.h> #include <linux/string_helpers.h> #include <linux/swap.h> #include <linux/swapops.h> #include <linux/jhash.h> #include <linux/numa.h> #include <linux/llist.h> #include <linux/cma.h> #include <linux/migrate.h> #include <linux/nospec.h> #include <linux/delayacct.h> #include <linux/memory.h> #include <linux/mm_inline.h> #include <asm/page.h> #include <asm/pgalloc.h> #include <asm/tlb.h> #include <linux/io.h> #include <linux/hugetlb.h> #include <linux/hugetlb_cgroup.h> #include <linux/node.h> #include <linux/page_owner.h> #include "internal.h" #include "hugetlb_vmemmap.h" int hugetlb_max_hstate __read_mostly; unsigned int default_hstate_idx; struct hstate hstates[HUGE_MAX_HSTATE]; #ifdef CONFIG_CMA static struct cma *hugetlb_cma[MAX_NUMNODES]; static unsigned long hugetlb_cma_size_in_node[MAX_NUMNODES] __initdata; static bool hugetlb_cma_folio(struct folio *folio, unsigned int order) { return cma_pages_valid(hugetlb_cma[folio_nid(folio)], &folio->page, 1 << order); } #else static bool hugetlb_cma_folio(struct folio *folio, unsigned int order) { return false; } #endif static unsigned long hugetlb_cma_size __initdata; __initdata LIST_HEAD(huge_boot_pages); /* for command line parsing */ static struct hstate * __initdata parsed_hstate; static unsigned long __initdata default_hstate_max_huge_pages; static bool __initdata parsed_valid_hugepagesz = true; static bool __initdata parsed_default_hugepagesz; static unsigned int default_hugepages_in_node[MAX_NUMNODES] __initdata; /* * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages, * free_huge_pages, and surplus_huge_pages. */ DEFINE_SPINLOCK(hugetlb_lock); /* * Serializes faults on the same logical page. This is used to * prevent spurious OOMs when the hugepage pool is fully utilized. */ static int num_fault_mutexes; struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp; /* Forward declaration */ static int hugetlb_acct_memory(struct hstate *h, long delta); static void hugetlb_vma_lock_free(struct vm_area_struct *vma); static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma); static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma); static void hugetlb_unshare_pmds(struct vm_area_struct *vma, unsigned long start, unsigned long end); static struct resv_map *vma_resv_map(struct vm_area_struct *vma); static inline bool subpool_is_free(struct hugepage_subpool *spool) { if (spool->count) return false; if (spool->max_hpages != -1) return spool->used_hpages == 0; if (spool->min_hpages != -1) return spool->rsv_hpages == spool->min_hpages; return true; } static inline void unlock_or_release_subpool(struct hugepage_subpool *spool, unsigned long irq_flags) { spin_unlock_irqrestore(&spool->lock, irq_flags); /* If no pages are used, and no other handles to the subpool * remain, give up any reservations based on minimum size and * free the subpool */ if (subpool_is_free(spool)) { if (spool->min_hpages != -1) hugetlb_acct_memory(spool->hstate, -spool->min_hpages); kfree(spool); } } struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages, long min_hpages) { struct hugepage_subpool *spool; spool = kzalloc(sizeof(*spool), GFP_KERNEL); if (!spool) return NULL; spin_lock_init(&spool->lock); spool->count = 1; spool->max_hpages = max_hpages; spool->hstate = h; spool->min_hpages = min_hpages; if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) { kfree(spool); return NULL; } spool->rsv_hpages = min_hpages; return spool; } void hugepage_put_subpool(struct hugepage_subpool *spool) { unsigned long flags; spin_lock_irqsave(&spool->lock, flags); BUG_ON(!spool->count); spool->count--; unlock_or_release_subpool(spool, flags); } /* * Subpool accounting for allocating and reserving pages. * Return -ENOMEM if there are not enough resources to satisfy the * request. Otherwise, return the number of pages by which the * global pools must be adjusted (upward). The returned value may * only be different than the passed value (delta) in the case where * a subpool minimum size must be maintained. */ static long hugepage_subpool_get_pages(struct hugepage_subpool *spool, long delta) { long ret = delta; if (!spool) return ret; spin_lock_irq(&spool->lock); if (spool->max_hpages != -1) { /* maximum size accounting */ if ((spool->used_hpages + delta) <= spool->max_hpages) spool->used_hpages += delta; else { ret = -ENOMEM; goto unlock_ret; } } /* minimum size accounting */ if (spool->min_hpages != -1 && spool->rsv_hpages) { if (delta > spool->rsv_hpages) { /* * Asking for more reserves than those already taken on * behalf of subpool. Return difference. */ ret = delta - spool->rsv_hpages; spool->rsv_hpages = 0; } else { ret = 0; /* reserves already accounted for */ spool->rsv_hpages -= delta; } } unlock_ret: spin_unlock_irq(&spool->lock); return ret; } /* * Subpool accounting for freeing and unreserving pages. * Return the number of global page reservations that must be dropped. * The return value may only be different than the passed value (delta) * in the case where a subpool minimum size must be maintained. */ static long hugepage_subpool_put_pages(struct hugepage_subpool *spool, long delta) { long ret = delta; unsigned long flags; if (!spool) return delta; spin_lock_irqsave(&spool->lock, flags); if (spool->max_hpages != -1) /* maximum size accounting */ spool->used_hpages -= delta; /* minimum size accounting */ if (spool->min_hpages != -1 && spool->used_hpages < spool->min_hpages) { if (spool->rsv_hpages + delta <= spool->min_hpages) ret = 0; else ret = spool->rsv_hpages + delta - spool->min_hpages; spool->rsv_hpages += delta; if (spool->rsv_hpages > spool->min_hpages) spool->rsv_hpages = spool->min_hpages; } /* * If hugetlbfs_put_super couldn't free spool due to an outstanding * quota reference, free it now. */ unlock_or_release_subpool(spool, flags); return ret; } static inline struct hugepage_subpool *subpool_inode(struct inode *inode) { return HUGETLBFS_SB(inode->i_sb)->spool; } static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma) { return subpool_inode(file_inode(vma->vm_file)); } /* * hugetlb vma_lock helper routines */ void hugetlb_vma_lock_read(struct vm_area_struct *vma) { if (__vma_shareable_lock(vma)) { struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; down_read(&vma_lock->rw_sema); } else if (__vma_private_lock(vma)) { struct resv_map *resv_map = vma_resv_map(vma); down_read(&resv_map->rw_sema); } } void hugetlb_vma_unlock_read(struct vm_area_struct *vma) { if (__vma_shareable_lock(vma)) { struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; up_read(&vma_lock->rw_sema); } else if (__vma_private_lock(vma)) { struct resv_map *resv_map = vma_resv_map(vma); up_read(&resv_map->rw_sema); } } void hugetlb_vma_lock_write(struct vm_area_struct *vma) { if (__vma_shareable_lock(vma)) { struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; down_write(&vma_lock->rw_sema); } else if (__vma_private_lock(vma)) { struct resv_map *resv_map = vma_resv_map(vma); down_write(&resv_map->rw_sema); } } void hugetlb_vma_unlock_write(struct vm_area_struct *vma) { if (__vma_shareable_lock(vma)) { struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; up_write(&vma_lock->rw_sema); } else if (__vma_private_lock(vma)) { struct resv_map *resv_map = vma_resv_map(vma); up_write(&resv_map->rw_sema); } } int hugetlb_vma_trylock_write(struct vm_area_struct *vma) { if (__vma_shareable_lock(vma)) { struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; return down_write_trylock(&vma_lock->rw_sema); } else if (__vma_private_lock(vma)) { struct resv_map *resv_map = vma_resv_map(vma); return down_write_trylock(&resv_map->rw_sema); } return 1; } void hugetlb_vma_assert_locked(struct vm_area_struct *vma) { if (__vma_shareable_lock(vma)) { struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; lockdep_assert_held(&vma_lock->rw_sema); } else if (__vma_private_lock(vma)) { struct resv_map *resv_map = vma_resv_map(vma); lockdep_assert_held(&resv_map->rw_sema); } } void hugetlb_vma_lock_release(struct kref *kref) { struct hugetlb_vma_lock *vma_lock = container_of(kref, struct hugetlb_vma_lock, refs); kfree(vma_lock); } static void __hugetlb_vma_unlock_write_put(struct hugetlb_vma_lock *vma_lock) { struct vm_area_struct *vma = vma_lock->vma; /* * vma_lock structure may or not be released as a result of put, * it certainly will no longer be attached to vma so clear pointer. * Semaphore synchronizes access to vma_lock->vma field. */ vma_lock->vma = NULL; vma->vm_private_data = NULL; up_write(&vma_lock->rw_sema); kref_put(&vma_lock->refs, hugetlb_vma_lock_release); } static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma) { if (__vma_shareable_lock(vma)) { struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; __hugetlb_vma_unlock_write_put(vma_lock); } else if (__vma_private_lock(vma)) { struct resv_map *resv_map = vma_resv_map(vma); /* no free for anon vmas, but still need to unlock */ up_write(&resv_map->rw_sema); } } static void hugetlb_vma_lock_free(struct vm_area_struct *vma) { /* * Only present in sharable vmas. */ if (!vma || !__vma_shareable_lock(vma)) return; if (vma->vm_private_data) { struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; down_write(&vma_lock->rw_sema); __hugetlb_vma_unlock_write_put(vma_lock); } } static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma) { struct hugetlb_vma_lock *vma_lock; /* Only establish in (flags) sharable vmas */ if (!vma || !(vma->vm_flags & VM_MAYSHARE)) return; /* Should never get here with non-NULL vm_private_data */ if (vma->vm_private_data) return; vma_lock = kmalloc(sizeof(*vma_lock), GFP_KERNEL); if (!vma_lock) { /* * If we can not allocate structure, then vma can not * participate in pmd sharing. This is only a possible * performance enhancement and memory saving issue. * However, the lock is also used to synchronize page * faults with truncation. If the lock is not present, * unlikely races could leave pages in a file past i_size * until the file is removed. Warn in the unlikely case of * allocation failure. */ pr_warn_once("HugeTLB: unable to allocate vma specific lock\n"); return; } kref_init(&vma_lock->refs); init_rwsem(&vma_lock->rw_sema); vma_lock->vma = vma; vma->vm_private_data = vma_lock; } /* Helper that removes a struct file_region from the resv_map cache and returns * it for use. */ static struct file_region * get_file_region_entry_from_cache(struct resv_map *resv, long from, long to) { struct file_region *nrg; VM_BUG_ON(resv->region_cache_count <= 0); resv->region_cache_count--; nrg = list_first_entry(&resv->region_cache, struct file_region, link); list_del(&nrg->link); nrg->from = from; nrg->to = to; return nrg; } static void copy_hugetlb_cgroup_uncharge_info(struct file_region *nrg, struct file_region *rg) { #ifdef CONFIG_CGROUP_HUGETLB nrg->reservation_counter = rg->reservation_counter; nrg->css = rg->css; if (rg->css) css_get(rg->css); #endif } /* Helper that records hugetlb_cgroup uncharge info. */ static void record_hugetlb_cgroup_uncharge_info(struct hugetlb_cgroup *h_cg, struct hstate *h, struct resv_map *resv, struct file_region *nrg) { #ifdef CONFIG_CGROUP_HUGETLB if (h_cg) { nrg->reservation_counter = &h_cg->rsvd_hugepage[hstate_index(h)]; nrg->css = &h_cg->css; /* * The caller will hold exactly one h_cg->css reference for the * whole contiguous reservation region. But this area might be * scattered when there are already some file_regions reside in * it. As a result, many file_regions may share only one css * reference. In order to ensure that one file_region must hold * exactly one h_cg->css reference, we should do css_get for * each file_region and leave the reference held by caller * untouched. */ css_get(&h_cg->css); if (!resv->pages_per_hpage) resv->pages_per_hpage = pages_per_huge_page(h); /* pages_per_hpage should be the same for all entries in * a resv_map. */ VM_BUG_ON(resv->pages_per_hpage != pages_per_huge_page(h)); } else { nrg->reservation_counter = NULL; nrg->css = NULL; } #endif } static void put_uncharge_info(struct file_region *rg) { #ifdef CONFIG_CGROUP_HUGETLB if (rg->css) css_put(rg->css); #endif } static bool has_same_uncharge_info(struct file_region *rg, struct file_region *org) { #ifdef CONFIG_CGROUP_HUGETLB return rg->reservation_counter == org->reservation_counter && rg->css == org->css; #else return true; #endif } static void coalesce_file_region(struct resv_map *resv, struct file_region *rg) { struct file_region *nrg, *prg; prg = list_prev_entry(rg, link); if (&prg->link != &resv->regions && prg->to == rg->from && has_same_uncharge_info(prg, rg)) { prg->to = rg->to; list_del(&rg->link); put_uncharge_info(rg); kfree(rg); rg = prg; } nrg = list_next_entry(rg, link); if (&nrg->link != &resv->regions && nrg->from == rg->to && has_same_uncharge_info(nrg, rg)) { nrg->from = rg->from; list_del(&rg->link); put_uncharge_info(rg); kfree(rg); } } static inline long hugetlb_resv_map_add(struct resv_map *map, struct list_head *rg, long from, long to, struct hstate *h, struct hugetlb_cgroup *cg, long *regions_needed) { struct file_region *nrg; if (!regions_needed) { nrg = get_file_region_entry_from_cache(map, from, to); record_hugetlb_cgroup_uncharge_info(cg, h, map, nrg); list_add(&nrg->link, rg); coalesce_file_region(map, nrg); } else *regions_needed += 1; return to - from; } /* * Must be called with resv->lock held. * * Calling this with regions_needed != NULL will count the number of pages * to be added but will not modify the linked list. And regions_needed will * indicate the number of file_regions needed in the cache to carry out to add * the regions for this range. */ static long add_reservation_in_range(struct resv_map *resv, long f, long t, struct hugetlb_cgroup *h_cg, struct hstate *h, long *regions_needed) { long add = 0; struct list_head *head = &resv->regions; long last_accounted_offset = f; struct file_region *iter, *trg = NULL; struct list_head *rg = NULL; if (regions_needed) *regions_needed = 0; /* In this loop, we essentially handle an entry for the range * [last_accounted_offset, iter->from), at every iteration, with some * bounds checking. */ list_for_each_entry_safe(iter, trg, head, link) { /* Skip irrelevant regions that start before our range. */ if (iter->from < f) { /* If this region ends after the last accounted offset, * then we need to update last_accounted_offset. */ if (iter->to > last_accounted_offset) last_accounted_offset = iter->to; continue; } /* When we find a region that starts beyond our range, we've * finished. */ if (iter->from >= t) { rg = iter->link.prev; break; } /* Add an entry for last_accounted_offset -> iter->from, and * update last_accounted_offset. */ if (iter->from > last_accounted_offset) add += hugetlb_resv_map_add(resv, iter->link.prev, last_accounted_offset, iter->from, h, h_cg, regions_needed); last_accounted_offset = iter->to; } /* Handle the case where our range extends beyond * last_accounted_offset. */ if (!rg) rg = head->prev; if (last_accounted_offset < t) add += hugetlb_resv_map_add(resv, rg, last_accounted_offset, t, h, h_cg, regions_needed); return add; } /* Must be called with resv->lock acquired. Will drop lock to allocate entries. */ static int allocate_file_region_entries(struct resv_map *resv, int regions_needed) __must_hold(&resv->lock) { LIST_HEAD(allocated_regions); int to_allocate = 0, i = 0; struct file_region *trg = NULL, *rg = NULL; VM_BUG_ON(regions_needed < 0); /* * Check for sufficient descriptors in the cache to accommodate * the number of in progress add operations plus regions_needed. * * This is a while loop because when we drop the lock, some other call * to region_add or region_del may have consumed some region_entries, * so we keep looping here until we finally have enough entries for * (adds_in_progress + regions_needed). */ while (resv->region_cache_count < (resv->adds_in_progress + regions_needed)) { to_allocate = resv->adds_in_progress + regions_needed - resv->region_cache_count; /* At this point, we should have enough entries in the cache * for all the existing adds_in_progress. We should only be * needing to allocate for regions_needed. */ VM_BUG_ON(resv->region_cache_count < resv->adds_in_progress); spin_unlock(&resv->lock); for (i = 0; i < to_allocate; i++) { trg = kmalloc(sizeof(*trg), GFP_KERNEL); if (!trg) goto out_of_memory; list_add(&trg->link, &allocated_regions); } spin_lock(&resv->lock); list_splice(&allocated_regions, &resv->region_cache); resv->region_cache_count += to_allocate; } return 0; out_of_memory: list_for_each_entry_safe(rg, trg, &allocated_regions, link) { list_del(&rg->link); kfree(rg); } return -ENOMEM; } /* * Add the huge page range represented by [f, t) to the reserve * map. Regions will be taken from the cache to fill in this range. * Sufficient regions should exist in the cache due to the previous * call to region_chg with the same range, but in some cases the cache will not * have sufficient entries due to races with other code doing region_add or * region_del. The extra needed entries will be allocated. * * regions_needed is the out value provided by a previous call to region_chg. * * Return the number of new huge pages added to the map. This number is greater * than or equal to zero. If file_region entries needed to be allocated for * this operation and we were not able to allocate, it returns -ENOMEM. * region_add of regions of length 1 never allocate file_regions and cannot * fail; region_chg will always allocate at least 1 entry and a region_add for * 1 page will only require at most 1 entry. */ static long region_add(struct resv_map *resv, long f, long t, long in_regions_needed, struct hstate *h, struct hugetlb_cgroup *h_cg) { long add = 0, actual_regions_needed = 0; spin_lock(&resv->lock); retry: /* Count how many regions are actually needed to execute this add. */ add_reservation_in_range(resv, f, t, NULL, NULL, &actual_regions_needed); /* * Check for sufficient descriptors in the cache to accommodate * this add operation. Note that actual_regions_needed may be greater * than in_regions_needed, as the resv_map may have been modified since * the region_chg call. In this case, we need to make sure that we * allocate extra entries, such that we have enough for all the * existing adds_in_progress, plus the excess needed for this * operation. */ if (actual_regions_needed > in_regions_needed && resv->region_cache_count < resv->adds_in_progress + (actual_regions_needed - in_regions_needed)) { /* region_add operation of range 1 should never need to * allocate file_region entries. */ VM_BUG_ON(t - f <= 1); if (allocate_file_region_entries( resv, actual_regions_needed - in_regions_needed)) { return -ENOMEM; } goto retry; } add = add_reservation_in_range(resv, f, t, h_cg, h, NULL); resv->adds_in_progress -= in_regions_needed; spin_unlock(&resv->lock); return add; } /* * Examine the existing reserve map and determine how many * huge pages in the specified range [f, t) are NOT currently * represented. This routine is called before a subsequent * call to region_add that will actually modify the reserve * map to add the specified range [f, t). region_chg does * not change the number of huge pages represented by the * map. A number of new file_region structures is added to the cache as a * placeholder, for the subsequent region_add call to use. At least 1 * file_region structure is added. * * out_regions_needed is the number of regions added to the * resv->adds_in_progress. This value needs to be provided to a follow up call * to region_add or region_abort for proper accounting. * * Returns the number of huge pages that need to be added to the existing * reservation map for the range [f, t). This number is greater or equal to * zero. -ENOMEM is returned if a new file_region structure or cache entry * is needed and can not be allocated. */ static long region_chg(struct resv_map *resv, long f, long t, long *out_regions_needed) { long chg = 0; spin_lock(&resv->lock); /* Count how many hugepages in this range are NOT represented. */ chg = add_reservation_in_range(resv, f, t, NULL, NULL, out_regions_needed); if (*out_regions_needed == 0) *out_regions_needed = 1; if (allocate_file_region_entries(resv, *out_regions_needed)) return -ENOMEM; resv->adds_in_progress += *out_regions_needed; spin_unlock(&resv->lock); return chg; } /* * Abort the in progress add operation. The adds_in_progress field * of the resv_map keeps track of the operations in progress between * calls to region_chg and region_add. Operations are sometimes * aborted after the call to region_chg. In such cases, region_abort * is called to decrement the adds_in_progress counter. regions_needed * is the value returned by the region_chg call, it is used to decrement * the adds_in_progress counter. * * NOTE: The range arguments [f, t) are not needed or used in this * routine. They are kept to make reading the calling code easier as * arguments will match the associated region_chg call. */ static void region_abort(struct resv_map *resv, long f, long t, long regions_needed) { spin_lock(&resv->lock); VM_BUG_ON(!resv->region_cache_count); resv->adds_in_progress -= regions_needed; spin_unlock(&resv->lock); } /* * Delete the specified range [f, t) from the reserve map. If the * t parameter is LONG_MAX, this indicates that ALL regions after f * should be deleted. Locate the regions which intersect [f, t) * and either trim, delete or split the existing regions. * * Returns the number of huge pages deleted from the reserve map. * In the normal case, the return value is zero or more. In the * case where a region must be split, a new region descriptor must * be allocated. If the allocation fails, -ENOMEM will be returned. * NOTE: If the parameter t == LONG_MAX, then we will never split * a region and possibly return -ENOMEM. Callers specifying * t == LONG_MAX do not need to check for -ENOMEM error. */ static long region_del(struct resv_map *resv, long f, long t) { struct list_head *head = &resv->regions; struct file_region *rg, *trg; struct file_region *nrg = NULL; long del = 0; retry: spin_lock(&resv->lock); list_for_each_entry_safe(rg, trg, head, link) { /* * Skip regions before the range to be deleted. file_region * ranges are normally of the form [from, to). However, there * may be a "placeholder" entry in the map which is of the form * (from, to) with from == to. Check for placeholder entries * at the beginning of the range to be deleted. */ if (rg->to <= f && (rg->to != rg->from || rg->to != f)) continue; if (rg->from >= t) break; if (f > rg->from && t < rg->to) { /* Must split region */ /* * Check for an entry in the cache before dropping * lock and attempting allocation. */ if (!nrg && resv->region_cache_count > resv->adds_in_progress) { nrg = list_first_entry(&resv->region_cache, struct file_region, link); list_del(&nrg->link); resv->region_cache_count--; } if (!nrg) { spin_unlock(&resv->lock); nrg = kmalloc(sizeof(*nrg), GFP_KERNEL); if (!nrg) return -ENOMEM; goto retry; } del += t - f; hugetlb_cgroup_uncharge_file_region( resv, rg, t - f, false); /* New entry for end of split region */ nrg->from = t; nrg->to = rg->to; copy_hugetlb_cgroup_uncharge_info(nrg, rg); INIT_LIST_HEAD(&nrg->link); /* Original entry is trimmed */ rg->to = f; list_add(&nrg->link, &rg->link); nrg = NULL; break; } if (f <= rg->from && t >= rg->to) { /* Remove entire region */ del += rg->to - rg->from; hugetlb_cgroup_uncharge_file_region(resv, rg, rg->to - rg->from, true); list_del(&rg->link); kfree(rg); continue; } if (f <= rg->from) { /* Trim beginning of region */ hugetlb_cgroup_uncharge_file_region(resv, rg, t - rg->from, false); del += t - rg->from; rg->from = t; } else { /* Trim end of region */ hugetlb_cgroup_uncharge_file_region(resv, rg, rg->to - f, false); del += rg->to - f; rg->to = f; } } spin_unlock(&resv->lock); kfree(nrg); return del; } /* * A rare out of memory error was encountered which prevented removal of * the reserve map region for a page. The huge page itself was free'ed * and removed from the page cache. This routine will adjust the subpool * usage count, and the global reserve count if needed. By incrementing * these counts, the reserve map entry which could not be deleted will * appear as a "reserved" entry instead of simply dangling with incorrect * counts. */ void hugetlb_fix_reserve_counts(struct inode *inode) { struct hugepage_subpool *spool = subpool_inode(inode); long rsv_adjust; bool reserved = false; rsv_adjust = hugepage_subpool_get_pages(spool, 1); if (rsv_adjust > 0) { struct hstate *h = hstate_inode(inode); if (!hugetlb_acct_memory(h, 1)) reserved = true; } else if (!rsv_adjust) { reserved = true; } if (!reserved) pr_warn("hugetlb: Huge Page Reserved count may go negative.\n"); } /* * Count and return the number of huge pages in the reserve map * that intersect with the range [f, t). */ static long region_count(struct resv_map *resv, long f, long t) { struct list_head *head = &resv->regions; struct file_region *rg; long chg = 0; spin_lock(&resv->lock); /* Locate each segment we overlap with, and count that overlap. */ list_for_each_entry(rg, head, link) { long seg_from; long seg_to; if (rg->to <= f) continue; if (rg->from >= t) break; seg_from = max(rg->from, f); seg_to = min(rg->to, t); chg += seg_to - seg_from; } spin_unlock(&resv->lock); return chg; } /* * Convert the address within this vma to the page offset within * the mapping, huge page units here. */ static pgoff_t vma_hugecache_offset(struct hstate *h, struct vm_area_struct *vma, unsigned long address) { return ((address - vma->vm_start) >> huge_page_shift(h)) + (vma->vm_pgoff >> huge_page_order(h)); } /** * vma_kernel_pagesize - Page size granularity for this VMA. * @vma: The user mapping. * * Folios in this VMA will be aligned to, and at least the size of the * number of bytes returned by this function. * * Return: The default size of the folios allocated when backing a VMA. */ unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) { if (vma->vm_ops && vma->vm_ops->pagesize) return vma->vm_ops->pagesize(vma); return PAGE_SIZE; } EXPORT_SYMBOL_GPL(vma_kernel_pagesize); /* * Return the page size being used by the MMU to back a VMA. In the majority * of cases, the page size used by the kernel matches the MMU size. On * architectures where it differs, an architecture-specific 'strong' * version of this symbol is required. */ __weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) { return vma_kernel_pagesize(vma); } /* * Flags for MAP_PRIVATE reservations. These are stored in the bottom * bits of the reservation map pointer, which are always clear due to * alignment. */ #define HPAGE_RESV_OWNER (1UL << 0) #define HPAGE_RESV_UNMAPPED (1UL << 1) #define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED) /* * These helpers are used to track how many pages are reserved for * faults in a MAP_PRIVATE mapping. Only the process that called mmap() * is guaranteed to have their future faults succeed. * * With the exception of hugetlb_dup_vma_private() which is called at fork(), * the reserve counters are updated with the hugetlb_lock held. It is safe * to reset the VMA at fork() time as it is not in use yet and there is no * chance of the global counters getting corrupted as a result of the values. * * The private mapping reservation is represented in a subtly different * manner to a shared mapping. A shared mapping has a region map associated * with the underlying file, this region map represents the backing file * pages which have ever had a reservation assigned which this persists even * after the page is instantiated. A private mapping has a region map * associated with the original mmap which is attached to all VMAs which * reference it, this region map represents those offsets which have consumed * reservation ie. where pages have been instantiated. */ static unsigned long get_vma_private_data(struct vm_area_struct *vma) { return (unsigned long)vma->vm_private_data; } static void set_vma_private_data(struct vm_area_struct *vma, unsigned long value) { vma->vm_private_data = (void *)value; } static void resv_map_set_hugetlb_cgroup_uncharge_info(struct resv_map *resv_map, struct hugetlb_cgroup *h_cg, struct hstate *h) { #ifdef CONFIG_CGROUP_HUGETLB if (!h_cg || !h) { resv_map->reservation_counter = NULL; resv_map->pages_per_hpage = 0; resv_map->css = NULL; } else { resv_map->reservation_counter = &h_cg->rsvd_hugepage[hstate_index(h)]; resv_map->pages_per_hpage = pages_per_huge_page(h); resv_map->css = &h_cg->css; } #endif } struct resv_map *resv_map_alloc(void) { struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL); struct file_region *rg = kmalloc(sizeof(*rg), GFP_KERNEL); if (!resv_map || !rg) { kfree(resv_map); kfree(rg); return NULL; } kref_init(&resv_map->refs); spin_lock_init(&resv_map->lock); INIT_LIST_HEAD(&resv_map->regions); init_rwsem(&resv_map->rw_sema); resv_map->adds_in_progress = 0; /* * Initialize these to 0. On shared mappings, 0's here indicate these * fields don't do cgroup accounting. On private mappings, these will be * re-initialized to the proper values, to indicate that hugetlb cgroup * reservations are to be un-charged from here. */ resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, NULL, NULL); INIT_LIST_HEAD(&resv_map->region_cache); list_add(&rg->link, &resv_map->region_cache); resv_map->region_cache_count = 1; return resv_map; } void resv_map_release(struct kref *ref) { struct resv_map *resv_map = container_of(ref, struct resv_map, refs); struct list_head *head = &resv_map->region_cache; struct file_region *rg, *trg; /* Clear out any active regions before we release the map. */ region_del(resv_map, 0, LONG_MAX); /* ... and any entries left in the cache */ list_for_each_entry_safe(rg, trg, head, link) { list_del(&rg->link); kfree(rg); } VM_BUG_ON(resv_map->adds_in_progress); kfree(resv_map); } static inline struct resv_map *inode_resv_map(struct inode *inode) { /* * At inode evict time, i_mapping may not point to the original * address space within the inode. This original address space * contains the pointer to the resv_map. So, always use the * address space embedded within the inode. * The VERY common case is inode->mapping == &inode->i_data but, * this may not be true for device special inodes. */ return (struct resv_map *)(&inode->i_data)->i_private_data; } static struct resv_map *vma_resv_map(struct vm_area_struct *vma) { VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); if (vma->vm_flags & VM_MAYSHARE) { struct address_space *mapping = vma->vm_file->f_mapping; struct inode *inode = mapping->host; return inode_resv_map(inode); } else { return (struct resv_map *)(get_vma_private_data(vma) & ~HPAGE_RESV_MASK); } } static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) { VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma); set_vma_private_data(vma, (unsigned long)map); } static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) { VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma); set_vma_private_data(vma, get_vma_private_data(vma) | flags); } static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) { VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); return (get_vma_private_data(vma) & flag) != 0; } bool __vma_private_lock(struct vm_area_struct *vma) { return !(vma->vm_flags & VM_MAYSHARE) && get_vma_private_data(vma) & ~HPAGE_RESV_MASK && is_vma_resv_set(vma, HPAGE_RESV_OWNER); } void hugetlb_dup_vma_private(struct vm_area_struct *vma) { VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); /* * Clear vm_private_data * - For shared mappings this is a per-vma semaphore that may be * allocated in a subsequent call to hugetlb_vm_op_open. * Before clearing, make sure pointer is not associated with vma * as this will leak the structure. This is the case when called * via clear_vma_resv_huge_pages() and hugetlb_vm_op_open has already * been called to allocate a new structure. * - For MAP_PRIVATE mappings, this is the reserve map which does * not apply to children. Faults generated by the children are * not guaranteed to succeed, even if read-only. */ if (vma->vm_flags & VM_MAYSHARE) { struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; if (vma_lock && vma_lock->vma != vma) vma->vm_private_data = NULL; } else vma->vm_private_data = NULL; } /* * Reset and decrement one ref on hugepage private reservation. * Called with mm->mmap_lock writer semaphore held. * This function should be only used by move_vma() and operate on * same sized vma. It should never come here with last ref on the * reservation. */ void clear_vma_resv_huge_pages(struct vm_area_struct *vma) { /* * Clear the old hugetlb private page reservation. * It has already been transferred to new_vma. * * During a mremap() operation of a hugetlb vma we call move_vma() * which copies vma into new_vma and unmaps vma. After the copy * operation both new_vma and vma share a reference to the resv_map * struct, and at that point vma is about to be unmapped. We don't * want to return the reservation to the pool at unmap of vma because * the reservation still lives on in new_vma, so simply decrement the * ref here and remove the resv_map reference from this vma. */ struct resv_map *reservations = vma_resv_map(vma); if (reservations && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { resv_map_put_hugetlb_cgroup_uncharge_info(reservations); kref_put(&reservations->refs, resv_map_release); } hugetlb_dup_vma_private(vma); } /* Returns true if the VMA has associated reserve pages */ static bool vma_has_reserves(struct vm_area_struct *vma, long chg) { if (vma->vm_flags & VM_NORESERVE) { /* * This address is already reserved by other process(chg == 0), * so, we should decrement reserved count. Without decrementing, * reserve count remains after releasing inode, because this * allocated page will go into page cache and is regarded as * coming from reserved pool in releasing step. Currently, we * don't have any other solution to deal with this situation * properly, so add work-around here. */ if (vma->vm_flags & VM_MAYSHARE && chg == 0) return true; else return false; } /* Shared mappings always use reserves */ if (vma->vm_flags & VM_MAYSHARE) { /* * We know VM_NORESERVE is not set. Therefore, there SHOULD * be a region map for all pages. The only situation where * there is no region map is if a hole was punched via * fallocate. In this case, there really are no reserves to * use. This situation is indicated if chg != 0. */ if (chg) return false; else return true; } /* * Only the process that called mmap() has reserves for * private mappings. */ if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { /* * Like the shared case above, a hole punch or truncate * could have been performed on the private mapping. * Examine the value of chg to determine if reserves * actually exist or were previously consumed. * Very Subtle - The value of chg comes from a previous * call to vma_needs_reserves(). The reserve map for * private mappings has different (opposite) semantics * than that of shared mappings. vma_needs_reserves() * has already taken this difference in semantics into * account. Therefore, the meaning of chg is the same * as in the shared case above. Code could easily be * combined, but keeping it separate draws attention to * subtle differences. */ if (chg) return false; else return true; } return false; } static void enqueue_hugetlb_folio(struct hstate *h, struct folio *folio) { int nid = folio_nid(folio); lockdep_assert_held(&hugetlb_lock); VM_BUG_ON_FOLIO(folio_ref_count(folio), folio); list_move(&folio->lru, &h->hugepage_freelists[nid]); h->free_huge_pages++; h->free_huge_pages_node[nid]++; folio_set_hugetlb_freed(folio); } static struct folio *dequeue_hugetlb_folio_node_exact(struct hstate *h, int nid) { struct folio *folio; bool pin = !!(current->flags & PF_MEMALLOC_PIN); lockdep_assert_held(&hugetlb_lock); list_for_each_entry(folio, &h->hugepage_freelists[nid], lru) { if (pin && !folio_is_longterm_pinnable(folio)) continue; if (folio_test_hwpoison(folio)) continue; list_move(&folio->lru, &h->hugepage_activelist); folio_ref_unfreeze(folio, 1); folio_clear_hugetlb_freed(folio); h->free_huge_pages--; h->free_huge_pages_node[nid]--; return folio; } return NULL; } static struct folio *dequeue_hugetlb_folio_nodemask(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask) { unsigned int cpuset_mems_cookie; struct zonelist *zonelist; struct zone *zone; struct zoneref *z; int node = NUMA_NO_NODE; zonelist = node_zonelist(nid, gfp_mask); retry_cpuset: cpuset_mems_cookie = read_mems_allowed_begin(); for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nmask) { struct folio *folio; if (!cpuset_zone_allowed(zone, gfp_mask)) continue; /* * no need to ask again on the same node. Pool is node rather than * zone aware */ if (zone_to_nid(zone) == node) continue; node = zone_to_nid(zone); folio = dequeue_hugetlb_folio_node_exact(h, node); if (folio) return folio; } if (unlikely(read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; return NULL; } static unsigned long available_huge_pages(struct hstate *h) { return h->free_huge_pages - h->resv_huge_pages; } static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address, int avoid_reserve, long chg) { struct folio *folio = NULL; struct mempolicy *mpol; gfp_t gfp_mask; nodemask_t *nodemask; int nid; /* * A child process with MAP_PRIVATE mappings created by their parent * have no page reserves. This check ensures that reservations are * not "stolen". The child may still get SIGKILLed */ if (!vma_has_reserves(vma, chg) && !available_huge_pages(h)) goto err; /* If reserves cannot be used, ensure enough pages are in the pool */ if (avoid_reserve && !available_huge_pages(h)) goto err; gfp_mask = htlb_alloc_mask(h); nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask); if (mpol_is_preferred_many(mpol)) { folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, nid, nodemask); /* Fallback to all nodes if page==NULL */ nodemask = NULL; } if (!folio) folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, nid, nodemask); if (folio && !avoid_reserve && vma_has_reserves(vma, chg)) { folio_set_hugetlb_restore_reserve(folio); h->resv_huge_pages--; } mpol_cond_put(mpol); return folio; err: return NULL; } /* * common helper functions for hstate_next_node_to_{alloc|free}. * We may have allocated or freed a huge page based on a different * nodes_allowed previously, so h->next_node_to_{alloc|free} might * be outside of *nodes_allowed. Ensure that we use an allowed * node for alloc or free. */ static int next_node_allowed(int nid, nodemask_t *nodes_allowed) { nid = next_node_in(nid, *nodes_allowed); VM_BUG_ON(nid >= MAX_NUMNODES); return nid; } static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed) { if (!node_isset(nid, *nodes_allowed)) nid = next_node_allowed(nid, nodes_allowed); return nid; } /* * returns the previously saved node ["this node"] from which to * allocate a persistent huge page for the pool and advance the * next node from which to allocate, handling wrap at end of node * mask. */ static int hstate_next_node_to_alloc(struct hstate *h, nodemask_t *nodes_allowed) { int nid; VM_BUG_ON(!nodes_allowed); nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed); h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed); return nid; } /* * helper for remove_pool_hugetlb_folio() - return the previously saved * node ["this node"] from which to free a huge page. Advance the * next node id whether or not we find a free huge page to free so * that the next attempt to free addresses the next node. */ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) { int nid; VM_BUG_ON(!nodes_allowed); nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed); h->next_nid_to_free = next_node_allowed(nid, nodes_allowed); return nid; } #define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \ for (nr_nodes = nodes_weight(*mask); \ nr_nodes > 0 && \ ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \ nr_nodes--) #define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \ for (nr_nodes = nodes_weight(*mask); \ nr_nodes > 0 && \ ((node = hstate_next_node_to_free(hs, mask)) || 1); \ nr_nodes--) /* used to demote non-gigantic_huge pages as well */ static void __destroy_compound_gigantic_folio(struct folio *folio, unsigned int order, bool demote) { int i; int nr_pages = 1 << order; struct page *p; atomic_set(&folio->_entire_mapcount, 0); atomic_set(&folio->_nr_pages_mapped, 0); atomic_set(&folio->_pincount, 0); for (i = 1; i < nr_pages; i++) { p = folio_page(folio, i); p->flags &= ~PAGE_FLAGS_CHECK_AT_FREE; p->mapping = NULL; clear_compound_head(p); if (!demote) set_page_refcounted(p); } __folio_clear_head(folio); } static void destroy_compound_hugetlb_folio_for_demote(struct folio *folio, unsigned int order) { __destroy_compound_gigantic_folio(folio, order, true); } #ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE static void destroy_compound_gigantic_folio(struct folio *folio, unsigned int order) { __destroy_compound_gigantic_folio(folio, order, false); } static void free_gigantic_folio(struct folio *folio, unsigned int order) { /* * If the page isn't allocated using the cma allocator, * cma_release() returns false. */ #ifdef CONFIG_CMA int nid = folio_nid(folio); if (cma_release(hugetlb_cma[nid], &folio->page, 1 << order)) return; #endif free_contig_range(folio_pfn(folio), 1 << order); } #ifdef CONFIG_CONTIG_ALLOC static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nodemask) { struct page *page; unsigned long nr_pages = pages_per_huge_page(h); if (nid == NUMA_NO_NODE) nid = numa_mem_id(); #ifdef CONFIG_CMA { int node; if (hugetlb_cma[nid]) { page = cma_alloc(hugetlb_cma[nid], nr_pages, huge_page_order(h), true); if (page) return page_folio(page); } if (!(gfp_mask & __GFP_THISNODE)) { for_each_node_mask(node, *nodemask) { if (node == nid || !hugetlb_cma[node]) continue; page = cma_alloc(hugetlb_cma[node], nr_pages, huge_page_order(h), true); if (page) return page_folio(page); } } } #endif page = alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask); return page ? page_folio(page) : NULL; } #else /* !CONFIG_CONTIG_ALLOC */ static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nodemask) { return NULL; } #endif /* CONFIG_CONTIG_ALLOC */ #else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */ static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nodemask) { return NULL; } static inline void free_gigantic_folio(struct folio *folio, unsigned int order) { } static inline void destroy_compound_gigantic_folio(struct folio *folio, unsigned int order) { } #endif static inline void __clear_hugetlb_destructor(struct hstate *h, struct folio *folio) { lockdep_assert_held(&hugetlb_lock); folio_clear_hugetlb(folio); } /* * Remove hugetlb folio from lists. * If vmemmap exists for the folio, update dtor so that the folio appears * as just a compound page. Otherwise, wait until after allocating vmemmap * to update dtor. * * A reference is held on the folio, except in the case of demote. * * Must be called with hugetlb lock held. */ static void __remove_hugetlb_folio(struct hstate *h, struct folio *folio, bool adjust_surplus, bool demote) { int nid = folio_nid(folio); VM_BUG_ON_FOLIO(hugetlb_cgroup_from_folio(folio), folio); VM_BUG_ON_FOLIO(hugetlb_cgroup_from_folio_rsvd(folio), folio); lockdep_assert_held(&hugetlb_lock); if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) return; list_del(&folio->lru); if (folio_test_hugetlb_freed(folio)) { h->free_huge_pages--; h->free_huge_pages_node[nid]--; } if (adjust_surplus) { h->surplus_huge_pages--; h->surplus_huge_pages_node[nid]--; } /* * We can only clear the hugetlb destructor after allocating vmemmap * pages. Otherwise, someone (memory error handling) may try to write * to tail struct pages. */ if (!folio_test_hugetlb_vmemmap_optimized(folio)) __clear_hugetlb_destructor(h, folio); /* * In the case of demote we do not ref count the page as it will soon * be turned into a page of smaller size. */ if (!demote) folio_ref_unfreeze(folio, 1); h->nr_huge_pages--; h->nr_huge_pages_node[nid]--; } static void remove_hugetlb_folio(struct hstate *h, struct folio *folio, bool adjust_surplus) { __remove_hugetlb_folio(h, folio, adjust_surplus, false); } static void remove_hugetlb_folio_for_demote(struct hstate *h, struct folio *folio, bool adjust_surplus) { __remove_hugetlb_folio(h, folio, adjust_surplus, true); } static void add_hugetlb_folio(struct hstate *h, struct folio *folio, bool adjust_surplus) { int zeroed; int nid = folio_nid(folio); VM_BUG_ON_FOLIO(!folio_test_hugetlb_vmemmap_optimized(folio), folio); lockdep_assert_held(&hugetlb_lock); INIT_LIST_HEAD(&folio->lru); h->nr_huge_pages++; h->nr_huge_pages_node[nid]++; if (adjust_surplus) { h->surplus_huge_pages++; h->surplus_huge_pages_node[nid]++; } folio_set_hugetlb(folio); folio_change_private(folio, NULL); /* * We have to set hugetlb_vmemmap_optimized again as above * folio_change_private(folio, NULL) cleared it. */ folio_set_hugetlb_vmemmap_optimized(folio); /* * This folio is about to be managed by the hugetlb allocator and * should have no users. Drop our reference, and check for others * just in case. */ zeroed = folio_put_testzero(folio); if (unlikely(!zeroed)) /* * It is VERY unlikely soneone else has taken a ref * on the folio. In this case, we simply return as * free_huge_folio() will be called when this other ref * is dropped. */ return; arch_clear_hugepage_flags(&folio->page); enqueue_hugetlb_folio(h, folio); } static void __update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio) { bool clear_dtor = folio_test_hugetlb_vmemmap_optimized(folio); if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) return; /* * If we don't know which subpages are hwpoisoned, we can't free * the hugepage, so it's leaked intentionally. */ if (folio_test_hugetlb_raw_hwp_unreliable(folio)) return; /* * If folio is not vmemmap optimized (!clear_dtor), then the folio * is no longer identified as a hugetlb page. hugetlb_vmemmap_restore_folio * can only be passed hugetlb pages and will BUG otherwise. */ if (clear_dtor && hugetlb_vmemmap_restore_folio(h, folio)) { spin_lock_irq(&hugetlb_lock); /* * If we cannot allocate vmemmap pages, just refuse to free the * page and put the page back on the hugetlb free list and treat * as a surplus page. */ add_hugetlb_folio(h, folio, true); spin_unlock_irq(&hugetlb_lock); return; } /* * Move PageHWPoison flag from head page to the raw error pages, * which makes any healthy subpages reusable. */ if (unlikely(folio_test_hwpoison(folio))) folio_clear_hugetlb_hwpoison(folio); /* * If vmemmap pages were allocated above, then we need to clear the * hugetlb destructor under the hugetlb lock. */ if (clear_dtor) { spin_lock_irq(&hugetlb_lock); __clear_hugetlb_destructor(h, folio); spin_unlock_irq(&hugetlb_lock); } /* * Non-gigantic pages demoted from CMA allocated gigantic pages * need to be given back to CMA in free_gigantic_folio. */ if (hstate_is_gigantic(h) || hugetlb_cma_folio(folio, huge_page_order(h))) { destroy_compound_gigantic_folio(folio, huge_page_order(h)); free_gigantic_folio(folio, huge_page_order(h)); } else { __free_pages(&folio->page, huge_page_order(h)); } } /* * As update_and_free_hugetlb_folio() can be called under any context, so we cannot * use GFP_KERNEL to allocate vmemmap pages. However, we can defer the * actual freeing in a workqueue to prevent from using GFP_ATOMIC to allocate * the vmemmap pages. * * free_hpage_workfn() locklessly retrieves the linked list of pages to be * freed and frees them one-by-one. As the page->mapping pointer is going * to be cleared in free_hpage_workfn() anyway, it is reused as the llist_node * structure of a lockless linked list of huge pages to be freed. */ static LLIST_HEAD(hpage_freelist); static void free_hpage_workfn(struct work_struct *work) { struct llist_node *node; node = llist_del_all(&hpage_freelist); while (node) { struct folio *folio; struct hstate *h; folio = container_of((struct address_space **)node, struct folio, mapping); node = node->next; folio->mapping = NULL; /* * The VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio) in * folio_hstate() is going to trigger because a previous call to * remove_hugetlb_folio() will clear the hugetlb bit, so do * not use folio_hstate() directly. */ h = size_to_hstate(folio_size(folio)); __update_and_free_hugetlb_folio(h, folio); cond_resched(); } } static DECLARE_WORK(free_hpage_work, free_hpage_workfn); static inline void flush_free_hpage_work(struct hstate *h) { if (hugetlb_vmemmap_optimizable(h)) flush_work(&free_hpage_work); } static void update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio, bool atomic) { if (!folio_test_hugetlb_vmemmap_optimized(folio) || !atomic) { __update_and_free_hugetlb_folio(h, folio); return; } /* * Defer freeing to avoid using GFP_ATOMIC to allocate vmemmap pages. * * Only call schedule_work() if hpage_freelist is previously * empty. Otherwise, schedule_work() had been called but the workfn * hasn't retrieved the list yet. */ if (llist_add((struct llist_node *)&folio->mapping, &hpage_freelist)) schedule_work(&free_hpage_work); } static void bulk_vmemmap_restore_error(struct hstate *h, struct list_head *folio_list, struct list_head *non_hvo_folios) { struct folio *folio, *t_folio; if (!list_empty(non_hvo_folios)) { /* * Free any restored hugetlb pages so that restore of the * entire list can be retried. * The idea is that in the common case of ENOMEM errors freeing * hugetlb pages with vmemmap we will free up memory so that we * can allocate vmemmap for more hugetlb pages. */ list_for_each_entry_safe(folio, t_folio, non_hvo_folios, lru) { list_del(&folio->lru); spin_lock_irq(&hugetlb_lock); __clear_hugetlb_destructor(h, folio); spin_unlock_irq(&hugetlb_lock); update_and_free_hugetlb_folio(h, folio, false); cond_resched(); } } else { /* * In the case where there are no folios which can be * immediately freed, we loop through the list trying to restore * vmemmap individually in the hope that someone elsewhere may * have done something to cause success (such as freeing some * memory). If unable to restore a hugetlb page, the hugetlb * page is made a surplus page and removed from the list. * If are able to restore vmemmap and free one hugetlb page, we * quit processing the list to retry the bulk operation. */ list_for_each_entry_safe(folio, t_folio, folio_list, lru) if (hugetlb_vmemmap_restore_folio(h, folio)) { list_del(&folio->lru); spin_lock_irq(&hugetlb_lock); add_hugetlb_folio(h, folio, true); spin_unlock_irq(&hugetlb_lock); } else { list_del(&folio->lru); spin_lock_irq(&hugetlb_lock); __clear_hugetlb_destructor(h, folio); spin_unlock_irq(&hugetlb_lock); update_and_free_hugetlb_folio(h, folio, false); cond_resched(); break; } } } static void update_and_free_pages_bulk(struct hstate *h, struct list_head *folio_list) { long ret; struct folio *folio, *t_folio; LIST_HEAD(non_hvo_folios); /* * First allocate required vmemmmap (if necessary) for all folios. * Carefully handle errors and free up any available hugetlb pages * in an effort to make forward progress. */ retry: ret = hugetlb_vmemmap_restore_folios(h, folio_list, &non_hvo_folios); if (ret < 0) { bulk_vmemmap_restore_error(h, folio_list, &non_hvo_folios); goto retry; } /* * At this point, list should be empty, ret should be >= 0 and there * should only be pages on the non_hvo_folios list. * Do note that the non_hvo_folios list could be empty. * Without HVO enabled, ret will be 0 and there is no need to call * __clear_hugetlb_destructor as this was done previously. */ VM_WARN_ON(!list_empty(folio_list)); VM_WARN_ON(ret < 0); if (!list_empty(&non_hvo_folios) && ret) { spin_lock_irq(&hugetlb_lock); list_for_each_entry(folio, &non_hvo_folios, lru) __clear_hugetlb_destructor(h, folio); spin_unlock_irq(&hugetlb_lock); } list_for_each_entry_safe(folio, t_folio, &non_hvo_folios, lru) { update_and_free_hugetlb_folio(h, folio, false); cond_resched(); } } struct hstate *size_to_hstate(unsigned long size) { struct hstate *h; for_each_hstate(h) { if (huge_page_size(h) == size) return h; } return NULL; } void free_huge_folio(struct folio *folio) { /* * Can't pass hstate in here because it is called from the * compound page destructor. */ struct hstate *h = folio_hstate(folio); int nid = folio_nid(folio); struct hugepage_subpool *spool = hugetlb_folio_subpool(folio); bool restore_reserve; unsigned long flags; VM_BUG_ON_FOLIO(folio_ref_count(folio), folio); VM_BUG_ON_FOLIO(folio_mapcount(folio), folio); hugetlb_set_folio_subpool(folio, NULL); if (folio_test_anon(folio)) __ClearPageAnonExclusive(&folio->page); folio->mapping = NULL; restore_reserve = folio_test_hugetlb_restore_reserve(folio); folio_clear_hugetlb_restore_reserve(folio); /* * If HPageRestoreReserve was set on page, page allocation consumed a * reservation. If the page was associated with a subpool, there * would have been a page reserved in the subpool before allocation * via hugepage_subpool_get_pages(). Since we are 'restoring' the * reservation, do not call hugepage_subpool_put_pages() as this will * remove the reserved page from the subpool. */ if (!restore_reserve) { /* * A return code of zero implies that the subpool will be * under its minimum size if the reservation is not restored * after page is free. Therefore, force restore_reserve * operation. */ if (hugepage_subpool_put_pages(spool, 1) == 0) restore_reserve = true; } spin_lock_irqsave(&hugetlb_lock, flags); folio_clear_hugetlb_migratable(folio); hugetlb_cgroup_uncharge_folio(hstate_index(h), pages_per_huge_page(h), folio); hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h), pages_per_huge_page(h), folio); mem_cgroup_uncharge(folio); if (restore_reserve) h->resv_huge_pages++; if (folio_test_hugetlb_temporary(folio)) { remove_hugetlb_folio(h, folio, false); spin_unlock_irqrestore(&hugetlb_lock, flags); update_and_free_hugetlb_folio(h, folio, true); } else if (h->surplus_huge_pages_node[nid]) { /* remove the page from active list */ remove_hugetlb_folio(h, folio, true); spin_unlock_irqrestore(&hugetlb_lock, flags); update_and_free_hugetlb_folio(h, folio, true); } else { arch_clear_hugepage_flags(&folio->page); enqueue_hugetlb_folio(h, folio); spin_unlock_irqrestore(&hugetlb_lock, flags); } } /* * Must be called with the hugetlb lock held */ static void __prep_account_new_huge_page(struct hstate *h, int nid) { lockdep_assert_held(&hugetlb_lock); h->nr_huge_pages++; h->nr_huge_pages_node[nid]++; } static void init_new_hugetlb_folio(struct hstate *h, struct folio *folio) { folio_set_hugetlb(folio); INIT_LIST_HEAD(&folio->lru); hugetlb_set_folio_subpool(folio, NULL); set_hugetlb_cgroup(folio, NULL); set_hugetlb_cgroup_rsvd(folio, NULL); } static void __prep_new_hugetlb_folio(struct hstate *h, struct folio *folio) { init_new_hugetlb_folio(h, folio); hugetlb_vmemmap_optimize_folio(h, folio); } static void prep_new_hugetlb_folio(struct hstate *h, struct folio *folio, int nid) { __prep_new_hugetlb_folio(h, folio); spin_lock_irq(&hugetlb_lock); __prep_account_new_huge_page(h, nid); spin_unlock_irq(&hugetlb_lock); } static bool __prep_compound_gigantic_folio(struct folio *folio, unsigned int order, bool demote) { int i, j; int nr_pages = 1 << order; struct page *p; __folio_clear_reserved(folio); for (i = 0; i < nr_pages; i++) { p = folio_page(folio, i); /* * For gigantic hugepages allocated through bootmem at * boot, it's safer to be consistent with the not-gigantic * hugepages and clear the PG_reserved bit from all tail pages * too. Otherwise drivers using get_user_pages() to access tail * pages may get the reference counting wrong if they see * PG_reserved set on a tail page (despite the head page not * having PG_reserved set). Enforcing this consistency between * head and tail pages allows drivers to optimize away a check * on the head page when they need know if put_page() is needed * after get_user_pages(). */ if (i != 0) /* head page cleared above */ __ClearPageReserved(p); /* * Subtle and very unlikely * * Gigantic 'page allocators' such as memblock or cma will * return a set of pages with each page ref counted. We need * to turn this set of pages into a compound page with tail * page ref counts set to zero. Code such as speculative page * cache adding could take a ref on a 'to be' tail page. * We need to respect any increased ref count, and only set * the ref count to zero if count is currently 1. If count * is not 1, we return an error. An error return indicates * the set of pages can not be converted to a gigantic page. * The caller who allocated the pages should then discard the * pages using the appropriate free interface. * * In the case of demote, the ref count will be zero. */ if (!demote) { if (!page_ref_freeze(p, 1)) { pr_warn("HugeTLB page can not be used due to unexpected inflated ref count\n"); goto out_error; } } else { VM_BUG_ON_PAGE(page_count(p), p); } if (i != 0) set_compound_head(p, &folio->page); } __folio_set_head(folio); /* we rely on prep_new_hugetlb_folio to set the destructor */ folio_set_order(folio, order); atomic_set(&folio->_entire_mapcount, -1); atomic_set(&folio->_nr_pages_mapped, 0); atomic_set(&folio->_pincount, 0); return true; out_error: /* undo page modifications made above */ for (j = 0; j < i; j++) { p = folio_page(folio, j); if (j != 0) clear_compound_head(p); set_page_refcounted(p); } /* need to clear PG_reserved on remaining tail pages */ for (; j < nr_pages; j++) { p = folio_page(folio, j); __ClearPageReserved(p); } return false; } static bool prep_compound_gigantic_folio(struct folio *folio, unsigned int order) { return __prep_compound_gigantic_folio(folio, order, false); } static bool prep_compound_gigantic_folio_for_demote(struct folio *folio, unsigned int order) { return __prep_compound_gigantic_folio(folio, order, true); } /* * PageHuge() only returns true for hugetlbfs pages, but not for normal or * transparent huge pages. See the PageTransHuge() documentation for more * details. */ int PageHuge(struct page *page) { struct folio *folio; if (!PageCompound(page)) return 0; folio = page_folio(page); return folio_test_hugetlb(folio); } EXPORT_SYMBOL_GPL(PageHuge); /* * Find and lock address space (mapping) in write mode. * * Upon entry, the page is locked which means that page_mapping() is * stable. Due to locking order, we can only trylock_write. If we can * not get the lock, simply return NULL to caller. */ struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage) { struct address_space *mapping = page_mapping(hpage); if (!mapping) return mapping; if (i_mmap_trylock_write(mapping)) return mapping; return NULL; } static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask, nodemask_t *node_alloc_noretry) { int order = huge_page_order(h); struct page *page; bool alloc_try_hard = true; bool retry = true; /* * By default we always try hard to allocate the page with * __GFP_RETRY_MAYFAIL flag. However, if we are allocating pages in * a loop (to adjust global huge page counts) and previous allocation * failed, do not continue to try hard on the same node. Use the * node_alloc_noretry bitmap to manage this state information. */ if (node_alloc_noretry && node_isset(nid, *node_alloc_noretry)) alloc_try_hard = false; gfp_mask |= __GFP_COMP|__GFP_NOWARN; if (alloc_try_hard) gfp_mask |= __GFP_RETRY_MAYFAIL; if (nid == NUMA_NO_NODE) nid = numa_mem_id(); retry: page = __alloc_pages(gfp_mask, order, nid, nmask); /* Freeze head page */ if (page && !page_ref_freeze(page, 1)) { __free_pages(page, order); if (retry) { /* retry once */ retry = false; goto retry; } /* WOW! twice in a row. */ pr_warn("HugeTLB head page unexpected inflated ref count\n"); page = NULL; } /* * If we did not specify __GFP_RETRY_MAYFAIL, but still got a page this * indicates an overall state change. Clear bit so that we resume * normal 'try hard' allocations. */ if (node_alloc_noretry && page && !alloc_try_hard) node_clear(nid, *node_alloc_noretry); /* * If we tried hard to get a page but failed, set bit so that * subsequent attempts will not try as hard until there is an * overall state change. */ if (node_alloc_noretry && !page && alloc_try_hard) node_set(nid, *node_alloc_noretry); if (!page) { __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); return NULL; } __count_vm_event(HTLB_BUDDY_PGALLOC); return page_folio(page); } static struct folio *__alloc_fresh_hugetlb_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask, nodemask_t *node_alloc_noretry) { struct folio *folio; bool retry = false; retry: if (hstate_is_gigantic(h)) folio = alloc_gigantic_folio(h, gfp_mask, nid, nmask); else folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, nmask, node_alloc_noretry); if (!folio) return NULL; if (hstate_is_gigantic(h)) { if (!prep_compound_gigantic_folio(folio, huge_page_order(h))) { /* * Rare failure to convert pages to compound page. * Free pages and try again - ONCE! */ free_gigantic_folio(folio, huge_page_order(h)); if (!retry) { retry = true; goto retry; } return NULL; } } return folio; } static struct folio *only_alloc_fresh_hugetlb_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask, nodemask_t *node_alloc_noretry) { struct folio *folio; folio = __alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, node_alloc_noretry); if (folio) init_new_hugetlb_folio(h, folio); return folio; } /* * Common helper to allocate a fresh hugetlb page. All specific allocators * should use this function to get new hugetlb pages * * Note that returned page is 'frozen': ref count of head page and all tail * pages is zero. */ static struct folio *alloc_fresh_hugetlb_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask, nodemask_t *node_alloc_noretry) { struct folio *folio; folio = __alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, node_alloc_noretry); if (!folio) return NULL; prep_new_hugetlb_folio(h, folio, folio_nid(folio)); return folio; } static void prep_and_add_allocated_folios(struct hstate *h, struct list_head *folio_list) { unsigned long flags; struct folio *folio, *tmp_f; /* Send list for bulk vmemmap optimization processing */ hugetlb_vmemmap_optimize_folios(h, folio_list); /* Add all new pool pages to free lists in one lock cycle */ spin_lock_irqsave(&hugetlb_lock, flags); list_for_each_entry_safe(folio, tmp_f, folio_list, lru) { __prep_account_new_huge_page(h, folio_nid(folio)); enqueue_hugetlb_folio(h, folio); } spin_unlock_irqrestore(&hugetlb_lock, flags); } /* * Allocates a fresh hugetlb page in a node interleaved manner. The page * will later be added to the appropriate hugetlb pool. */ static struct folio *alloc_pool_huge_folio(struct hstate *h, nodemask_t *nodes_allowed, nodemask_t *node_alloc_noretry) { gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; int nr_nodes, node; for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { struct folio *folio; folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, node, nodes_allowed, node_alloc_noretry); if (folio) return folio; } return NULL; } /* * Remove huge page from pool from next node to free. Attempt to keep * persistent huge pages more or less balanced over allowed nodes. * This routine only 'removes' the hugetlb page. The caller must make * an additional call to free the page to low level allocators. * Called with hugetlb_lock locked. */ static struct folio *remove_pool_hugetlb_folio(struct hstate *h, nodemask_t *nodes_allowed, bool acct_surplus) { int nr_nodes, node; struct folio *folio = NULL; lockdep_assert_held(&hugetlb_lock); for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { /* * If we're returning unused surplus pages, only examine * nodes with surplus pages. */ if ((!acct_surplus || h->surplus_huge_pages_node[node]) && !list_empty(&h->hugepage_freelists[node])) { folio = list_entry(h->hugepage_freelists[node].next, struct folio, lru); remove_hugetlb_folio(h, folio, acct_surplus); break; } } return folio; } /* * Dissolve a given free hugepage into free buddy pages. This function does * nothing for in-use hugepages and non-hugepages. * This function returns values like below: * * -ENOMEM: failed to allocate vmemmap pages to free the freed hugepages * when the system is under memory pressure and the feature of * freeing unused vmemmap pages associated with each hugetlb page * is enabled. * -EBUSY: failed to dissolved free hugepages or the hugepage is in-use * (allocated or reserved.) * 0: successfully dissolved free hugepages or the page is not a * hugepage (considered as already dissolved) */ int dissolve_free_huge_page(struct page *page) { int rc = -EBUSY; struct folio *folio = page_folio(page); retry: /* Not to disrupt normal path by vainly holding hugetlb_lock */ if (!folio_test_hugetlb(folio)) return 0; spin_lock_irq(&hugetlb_lock); if (!folio_test_hugetlb(folio)) { rc = 0; goto out; } if (!folio_ref_count(folio)) { struct hstate *h = folio_hstate(folio); if (!available_huge_pages(h)) goto out; /* * We should make sure that the page is already on the free list * when it is dissolved. */ if (unlikely(!folio_test_hugetlb_freed(folio))) { spin_unlock_irq(&hugetlb_lock); cond_resched(); /* * Theoretically, we should return -EBUSY when we * encounter this race. In fact, we have a chance * to successfully dissolve the page if we do a * retry. Because the race window is quite small. * If we seize this opportunity, it is an optimization * for increasing the success rate of dissolving page. */ goto retry; } remove_hugetlb_folio(h, folio, false); h->max_huge_pages--; spin_unlock_irq(&hugetlb_lock); /* * Normally update_and_free_hugtlb_folio will allocate required vmemmmap * before freeing the page. update_and_free_hugtlb_folio will fail to * free the page if it can not allocate required vmemmap. We * need to adjust max_huge_pages if the page is not freed. * Attempt to allocate vmemmmap here so that we can take * appropriate action on failure. * * The folio_test_hugetlb check here is because * remove_hugetlb_folio will clear hugetlb folio flag for * non-vmemmap optimized hugetlb folios. */ if (folio_test_hugetlb(folio)) { rc = hugetlb_vmemmap_restore_folio(h, folio); if (rc) { spin_lock_irq(&hugetlb_lock); add_hugetlb_folio(h, folio, false); h->max_huge_pages++; goto out; } } else rc = 0; update_and_free_hugetlb_folio(h, folio, false); return rc; } out: spin_unlock_irq(&hugetlb_lock); return rc; } /* * Dissolve free hugepages in a given pfn range. Used by memory hotplug to * make specified memory blocks removable from the system. * Note that this will dissolve a free gigantic hugepage completely, if any * part of it lies within the given range. * Also note that if dissolve_free_huge_page() returns with an error, all * free hugepages that were dissolved before that error are lost. */ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) { unsigned long pfn; struct page *page; int rc = 0; unsigned int order; struct hstate *h; if (!hugepages_supported()) return rc; order = huge_page_order(&default_hstate); for_each_hstate(h) order = min(order, huge_page_order(h)); for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order) { page = pfn_to_page(pfn); rc = dissolve_free_huge_page(page); if (rc) break; } return rc; } /* * Allocates a fresh surplus page from the page allocator. */ static struct folio *alloc_surplus_hugetlb_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask) { struct folio *folio = NULL; if (hstate_is_gigantic(h)) return NULL; spin_lock_irq(&hugetlb_lock); if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) goto out_unlock; spin_unlock_irq(&hugetlb_lock); folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, NULL); if (!folio) return NULL; spin_lock_irq(&hugetlb_lock); /* * We could have raced with the pool size change. * Double check that and simply deallocate the new page * if we would end up overcommiting the surpluses. Abuse * temporary page to workaround the nasty free_huge_folio * codeflow */ if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) { folio_set_hugetlb_temporary(folio); spin_unlock_irq(&hugetlb_lock); free_huge_folio(folio); return NULL; } h->surplus_huge_pages++; h->surplus_huge_pages_node[folio_nid(folio)]++; out_unlock: spin_unlock_irq(&hugetlb_lock); return folio; } static struct folio *alloc_migrate_hugetlb_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask) { struct folio *folio; if (hstate_is_gigantic(h)) return NULL; folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, NULL); if (!folio) return NULL; /* fresh huge pages are frozen */ folio_ref_unfreeze(folio, 1); /* * We do not account these pages as surplus because they are only * temporary and will be released properly on the last reference */ folio_set_hugetlb_temporary(folio); return folio; } /* * Use the VMA's mpolicy to allocate a huge page from the buddy. */ static struct folio *alloc_buddy_hugetlb_folio_with_mpol(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { struct folio *folio = NULL; struct mempolicy *mpol; gfp_t gfp_mask = htlb_alloc_mask(h); int nid; nodemask_t *nodemask; nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask); if (mpol_is_preferred_many(mpol)) { gfp_t gfp = gfp_mask | __GFP_NOWARN; gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); folio = alloc_surplus_hugetlb_folio(h, gfp, nid, nodemask); /* Fallback to all nodes if page==NULL */ nodemask = NULL; } if (!folio) folio = alloc_surplus_hugetlb_folio(h, gfp_mask, nid, nodemask); mpol_cond_put(mpol); return folio; } /* folio migration callback function */ struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask) { spin_lock_irq(&hugetlb_lock); if (available_huge_pages(h)) { struct folio *folio; folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, preferred_nid, nmask); if (folio) { spin_unlock_irq(&hugetlb_lock); return folio; } } spin_unlock_irq(&hugetlb_lock); return alloc_migrate_hugetlb_folio(h, gfp_mask, preferred_nid, nmask); } /* * Increase the hugetlb pool such that it can accommodate a reservation * of size 'delta'. */ static int gather_surplus_pages(struct hstate *h, long delta) __must_hold(&hugetlb_lock) { LIST_HEAD(surplus_list); struct folio *folio, *tmp; int ret; long i; long needed, allocated; bool alloc_ok = true; lockdep_assert_held(&hugetlb_lock); needed = (h->resv_huge_pages + delta) - h->free_huge_pages; if (needed <= 0) { h->resv_huge_pages += delta; return 0; } allocated = 0; ret = -ENOMEM; retry: spin_unlock_irq(&hugetlb_lock); for (i = 0; i < needed; i++) { folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h), NUMA_NO_NODE, NULL); if (!folio) { alloc_ok = false; break; } list_add(&folio->lru, &surplus_list); cond_resched(); } allocated += i; /* * After retaking hugetlb_lock, we need to recalculate 'needed' * because either resv_huge_pages or free_huge_pages may have changed. */ spin_lock_irq(&hugetlb_lock); needed = (h->resv_huge_pages + delta) - (h->free_huge_pages + allocated); if (needed > 0) { if (alloc_ok) goto retry; /* * We were not able to allocate enough pages to * satisfy the entire reservation so we free what * we've allocated so far. */ goto free; } /* * The surplus_list now contains _at_least_ the number of extra pages * needed to accommodate the reservation. Add the appropriate number * of pages to the hugetlb pool and free the extras back to the buddy * allocator. Commit the entire reservation here to prevent another * process from stealing the pages as they are added to the pool but * before they are reserved. */ needed += allocated; h->resv_huge_pages += delta; ret = 0; /* Free the needed pages to the hugetlb pool */ list_for_each_entry_safe(folio, tmp, &surplus_list, lru) { if ((--needed) < 0) break; /* Add the page to the hugetlb allocator */ enqueue_hugetlb_folio(h, folio); } free: spin_unlock_irq(&hugetlb_lock); /* * Free unnecessary surplus pages to the buddy allocator. * Pages have no ref count, call free_huge_folio directly. */ list_for_each_entry_safe(folio, tmp, &surplus_list, lru) free_huge_folio(folio); spin_lock_irq(&hugetlb_lock); return ret; } /* * This routine has two main purposes: * 1) Decrement the reservation count (resv_huge_pages) by the value passed * in unused_resv_pages. This corresponds to the prior adjustments made * to the associated reservation map. * 2) Free any unused surplus pages that may have been allocated to satisfy * the reservation. As many as unused_resv_pages may be freed. */ static void return_unused_surplus_pages(struct hstate *h, unsigned long unused_resv_pages) { unsigned long nr_pages; LIST_HEAD(page_list); lockdep_assert_held(&hugetlb_lock); /* Uncommit the reservation */ h->resv_huge_pages -= unused_resv_pages; if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) goto out; /* * Part (or even all) of the reservation could have been backed * by pre-allocated pages. Only free surplus pages. */ nr_pages = min(unused_resv_pages, h->surplus_huge_pages); /* * We want to release as many surplus pages as possible, spread * evenly across all nodes with memory. Iterate across these nodes * until we can no longer free unreserved surplus pages. This occurs * when the nodes with surplus pages have no free pages. * remove_pool_hugetlb_folio() will balance the freed pages across the * on-line nodes with memory and will handle the hstate accounting. */ while (nr_pages--) { struct folio *folio; folio = remove_pool_hugetlb_folio(h, &node_states[N_MEMORY], 1); if (!folio) goto out; list_add(&folio->lru, &page_list); } out: spin_unlock_irq(&hugetlb_lock); update_and_free_pages_bulk(h, &page_list); spin_lock_irq(&hugetlb_lock); } /* * vma_needs_reservation, vma_commit_reservation and vma_end_reservation * are used by the huge page allocation routines to manage reservations. * * vma_needs_reservation is called to determine if the huge page at addr * within the vma has an associated reservation. If a reservation is * needed, the value 1 is returned. The caller is then responsible for * managing the global reservation and subpool usage counts. After * the huge page has been allocated, vma_commit_reservation is called * to add the page to the reservation map. If the page allocation fails, * the reservation must be ended instead of committed. vma_end_reservation * is called in such cases. * * In the normal case, vma_commit_reservation returns the same value * as the preceding vma_needs_reservation call. The only time this * is not the case is if a reserve map was changed between calls. It * is the responsibility of the caller to notice the difference and * take appropriate action. * * vma_add_reservation is used in error paths where a reservation must * be restored when a newly allocated huge page must be freed. It is * to be called after calling vma_needs_reservation to determine if a * reservation exists. * * vma_del_reservation is used in error paths where an entry in the reserve * map was created during huge page allocation and must be removed. It is to * be called after calling vma_needs_reservation to determine if a reservation * exists. */ enum vma_resv_mode { VMA_NEEDS_RESV, VMA_COMMIT_RESV, VMA_END_RESV, VMA_ADD_RESV, VMA_DEL_RESV, }; static long __vma_reservation_common(struct hstate *h, struct vm_area_struct *vma, unsigned long addr, enum vma_resv_mode mode) { struct resv_map *resv; pgoff_t idx; long ret; long dummy_out_regions_needed; resv = vma_resv_map(vma); if (!resv) return 1; idx = vma_hugecache_offset(h, vma, addr); switch (mode) { case VMA_NEEDS_RESV: ret = region_chg(resv, idx, idx + 1, &dummy_out_regions_needed); /* We assume that vma_reservation_* routines always operate on * 1 page, and that adding to resv map a 1 page entry can only * ever require 1 region. */ VM_BUG_ON(dummy_out_regions_needed != 1); break; case VMA_COMMIT_RESV: ret = region_add(resv, idx, idx + 1, 1, NULL, NULL); /* region_add calls of range 1 should never fail. */ VM_BUG_ON(ret < 0); break; case VMA_END_RESV: region_abort(resv, idx, idx + 1, 1); ret = 0; break; case VMA_ADD_RESV: if (vma->vm_flags & VM_MAYSHARE) { ret = region_add(resv, idx, idx + 1, 1, NULL, NULL); /* region_add calls of range 1 should never fail. */ VM_BUG_ON(ret < 0); } else { region_abort(resv, idx, idx + 1, 1); ret = region_del(resv, idx, idx + 1); } break; case VMA_DEL_RESV: if (vma->vm_flags & VM_MAYSHARE) { region_abort(resv, idx, idx + 1, 1); ret = region_del(resv, idx, idx + 1); } else { ret = region_add(resv, idx, idx + 1, 1, NULL, NULL); /* region_add calls of range 1 should never fail. */ VM_BUG_ON(ret < 0); } break; default: BUG(); } if (vma->vm_flags & VM_MAYSHARE || mode == VMA_DEL_RESV) return ret; /* * We know private mapping must have HPAGE_RESV_OWNER set. * * In most cases, reserves always exist for private mappings. * However, a file associated with mapping could have been * hole punched or truncated after reserves were consumed. * As subsequent fault on such a range will not use reserves. * Subtle - The reserve map for private mappings has the * opposite meaning than that of shared mappings. If NO * entry is in the reserve map, it means a reservation exists. * If an entry exists in the reserve map, it means the * reservation has already been consumed. As a result, the * return value of this routine is the opposite of the * value returned from reserve map manipulation routines above. */ if (ret > 0) return 0; if (ret == 0) return 1; return ret; } static long vma_needs_reservation(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { return __vma_reservation_common(h, vma, addr, VMA_NEEDS_RESV); } static long vma_commit_reservation(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { return __vma_reservation_common(h, vma, addr, VMA_COMMIT_RESV); } static void vma_end_reservation(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { (void)__vma_reservation_common(h, vma, addr, VMA_END_RESV); } static long vma_add_reservation(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { return __vma_reservation_common(h, vma, addr, VMA_ADD_RESV); } static long vma_del_reservation(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { return __vma_reservation_common(h, vma, addr, VMA_DEL_RESV); } /* * This routine is called to restore reservation information on error paths. * It should ONLY be called for folios allocated via alloc_hugetlb_folio(), * and the hugetlb mutex should remain held when calling this routine. * * It handles two specific cases: * 1) A reservation was in place and the folio consumed the reservation. * hugetlb_restore_reserve is set in the folio. * 2) No reservation was in place for the page, so hugetlb_restore_reserve is * not set. However, alloc_hugetlb_folio always updates the reserve map. * * In case 1, free_huge_folio later in the error path will increment the * global reserve count. But, free_huge_folio does not have enough context * to adjust the reservation map. This case deals primarily with private * mappings. Adjust the reserve map here to be consistent with global * reserve count adjustments to be made by free_huge_folio. Make sure the * reserve map indicates there is a reservation present. * * In case 2, simply undo reserve map modifications done by alloc_hugetlb_folio. */ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, unsigned long address, struct folio *folio) { long rc = vma_needs_reservation(h, vma, address); if (folio_test_hugetlb_restore_reserve(folio)) { if (unlikely(rc < 0)) /* * Rare out of memory condition in reserve map * manipulation. Clear hugetlb_restore_reserve so * that global reserve count will not be incremented * by free_huge_folio. This will make it appear * as though the reservation for this folio was * consumed. This may prevent the task from * faulting in the folio at a later time. This * is better than inconsistent global huge page * accounting of reserve counts. */ folio_clear_hugetlb_restore_reserve(folio); else if (rc) (void)vma_add_reservation(h, vma, address); else vma_end_reservation(h, vma, address); } else { if (!rc) { /* * This indicates there is an entry in the reserve map * not added by alloc_hugetlb_folio. We know it was added * before the alloc_hugetlb_folio call, otherwise * hugetlb_restore_reserve would be set on the folio. * Remove the entry so that a subsequent allocation * does not consume a reservation. */ rc = vma_del_reservation(h, vma, address); if (rc < 0) /* * VERY rare out of memory condition. Since * we can not delete the entry, set * hugetlb_restore_reserve so that the reserve * count will be incremented when the folio * is freed. This reserve will be consumed * on a subsequent allocation. */ folio_set_hugetlb_restore_reserve(folio); } else if (rc < 0) { /* * Rare out of memory condition from * vma_needs_reservation call. Memory allocation is * only attempted if a new entry is needed. Therefore, * this implies there is not an entry in the * reserve map. * * For shared mappings, no entry in the map indicates * no reservation. We are done. */ if (!(vma->vm_flags & VM_MAYSHARE)) /* * For private mappings, no entry indicates * a reservation is present. Since we can * not add an entry, set hugetlb_restore_reserve * on the folio so reserve count will be * incremented when freed. This reserve will * be consumed on a subsequent allocation. */ folio_set_hugetlb_restore_reserve(folio); } else /* * No reservation present, do nothing */ vma_end_reservation(h, vma, address); } } /* * alloc_and_dissolve_hugetlb_folio - Allocate a new folio and dissolve * the old one * @h: struct hstate old page belongs to * @old_folio: Old folio to dissolve * @list: List to isolate the page in case we need to * Returns 0 on success, otherwise negated error. */ static int alloc_and_dissolve_hugetlb_folio(struct hstate *h, struct folio *old_folio, struct list_head *list) { gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; int nid = folio_nid(old_folio); struct folio *new_folio; int ret = 0; /* * Before dissolving the folio, we need to allocate a new one for the * pool to remain stable. Here, we allocate the folio and 'prep' it * by doing everything but actually updating counters and adding to * the pool. This simplifies and let us do most of the processing * under the lock. */ new_folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, NULL, NULL); if (!new_folio) return -ENOMEM; __prep_new_hugetlb_folio(h, new_folio); retry: spin_lock_irq(&hugetlb_lock); if (!folio_test_hugetlb(old_folio)) { /* * Freed from under us. Drop new_folio too. */ goto free_new; } else if (folio_ref_count(old_folio)) { bool isolated; /* * Someone has grabbed the folio, try to isolate it here. * Fail with -EBUSY if not possible. */ spin_unlock_irq(&hugetlb_lock); isolated = isolate_hugetlb(old_folio, list); ret = isolated ? 0 : -EBUSY; spin_lock_irq(&hugetlb_lock); goto free_new; } else if (!folio_test_hugetlb_freed(old_folio)) { /* * Folio's refcount is 0 but it has not been enqueued in the * freelist yet. Race window is small, so we can succeed here if * we retry. */ spin_unlock_irq(&hugetlb_lock); cond_resched(); goto retry; } else { /* * Ok, old_folio is still a genuine free hugepage. Remove it from * the freelist and decrease the counters. These will be * incremented again when calling __prep_account_new_huge_page() * and enqueue_hugetlb_folio() for new_folio. The counters will * remain stable since this happens under the lock. */ remove_hugetlb_folio(h, old_folio, false); /* * Ref count on new_folio is already zero as it was dropped * earlier. It can be directly added to the pool free list. */ __prep_account_new_huge_page(h, nid); enqueue_hugetlb_folio(h, new_folio); /* * Folio has been replaced, we can safely free the old one. */ spin_unlock_irq(&hugetlb_lock); update_and_free_hugetlb_folio(h, old_folio, false); } return ret; free_new: spin_unlock_irq(&hugetlb_lock); /* Folio has a zero ref count, but needs a ref to be freed */ folio_ref_unfreeze(new_folio, 1); update_and_free_hugetlb_folio(h, new_folio, false); return ret; } int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) { struct hstate *h; struct folio *folio = page_folio(page); int ret = -EBUSY; /* * The page might have been dissolved from under our feet, so make sure * to carefully check the state under the lock. * Return success when racing as if we dissolved the page ourselves. */ spin_lock_irq(&hugetlb_lock); if (folio_test_hugetlb(folio)) { h = folio_hstate(folio); } else { spin_unlock_irq(&hugetlb_lock); return 0; } spin_unlock_irq(&hugetlb_lock); /* * Fence off gigantic pages as there is a cyclic dependency between * alloc_contig_range and them. Return -ENOMEM as this has the effect * of bailing out right away without further retrying. */ if (hstate_is_gigantic(h)) return -ENOMEM; if (folio_ref_count(folio) && isolate_hugetlb(folio, list)) ret = 0; else if (!folio_ref_count(folio)) ret = alloc_and_dissolve_hugetlb_folio(h, folio, list); return ret; } struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve) { struct hugepage_subpool *spool = subpool_vma(vma); struct hstate *h = hstate_vma(vma); struct folio *folio; long map_chg, map_commit, nr_pages = pages_per_huge_page(h); long gbl_chg; int memcg_charge_ret, ret, idx; struct hugetlb_cgroup *h_cg = NULL; struct mem_cgroup *memcg; bool deferred_reserve; gfp_t gfp = htlb_alloc_mask(h) | __GFP_RETRY_MAYFAIL; memcg = get_mem_cgroup_from_current(); memcg_charge_ret = mem_cgroup_hugetlb_try_charge(memcg, gfp, nr_pages); if (memcg_charge_ret == -ENOMEM) { mem_cgroup_put(memcg); return ERR_PTR(-ENOMEM); } idx = hstate_index(h); /* * Examine the region/reserve map to determine if the process * has a reservation for the page to be allocated. A return * code of zero indicates a reservation exists (no change). */ map_chg = gbl_chg = vma_needs_reservation(h, vma, addr); if (map_chg < 0) { if (!memcg_charge_ret) mem_cgroup_cancel_charge(memcg, nr_pages); mem_cgroup_put(memcg); return ERR_PTR(-ENOMEM); } /* * Processes that did not create the mapping will have no * reserves as indicated by the region/reserve map. Check * that the allocation will not exceed the subpool limit. * Allocations for MAP_NORESERVE mappings also need to be * checked against any subpool limit. */ if (map_chg || avoid_reserve) { gbl_chg = hugepage_subpool_get_pages(spool, 1); if (gbl_chg < 0) goto out_end_reservation; /* * Even though there was no reservation in the region/reserve * map, there could be reservations associated with the * subpool that can be used. This would be indicated if the * return value of hugepage_subpool_get_pages() is zero. * However, if avoid_reserve is specified we still avoid even * the subpool reservations. */ if (avoid_reserve) gbl_chg = 1; } /* If this allocation is not consuming a reservation, charge it now. */ deferred_reserve = map_chg || avoid_reserve; if (deferred_reserve) { ret = hugetlb_cgroup_charge_cgroup_rsvd( idx, pages_per_huge_page(h), &h_cg); if (ret) goto out_subpool_put; } ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); if (ret) goto out_uncharge_cgroup_reservation; spin_lock_irq(&hugetlb_lock); /* * glb_chg is passed to indicate whether or not a page must be taken * from the global free pool (global change). gbl_chg == 0 indicates * a reservation exists for the allocation. */ folio = dequeue_hugetlb_folio_vma(h, vma, addr, avoid_reserve, gbl_chg); if (!folio) { spin_unlock_irq(&hugetlb_lock); folio = alloc_buddy_hugetlb_folio_with_mpol(h, vma, addr); if (!folio) goto out_uncharge_cgroup; spin_lock_irq(&hugetlb_lock); if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) { folio_set_hugetlb_restore_reserve(folio); h->resv_huge_pages--; } list_add(&folio->lru, &h->hugepage_activelist); folio_ref_unfreeze(folio, 1); /* Fall through */ } hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, folio); /* If allocation is not consuming a reservation, also store the * hugetlb_cgroup pointer on the page. */ if (deferred_reserve) { hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h), h_cg, folio); } spin_unlock_irq(&hugetlb_lock); hugetlb_set_folio_subpool(folio, spool); map_commit = vma_commit_reservation(h, vma, addr); if (unlikely(map_chg > map_commit)) { /* * The page was added to the reservation map between * vma_needs_reservation and vma_commit_reservation. * This indicates a race with hugetlb_reserve_pages. * Adjust for the subpool count incremented above AND * in hugetlb_reserve_pages for the same page. Also, * the reservation count added in hugetlb_reserve_pages * no longer applies. */ long rsv_adjust; rsv_adjust = hugepage_subpool_put_pages(spool, 1); hugetlb_acct_memory(h, -rsv_adjust); if (deferred_reserve) hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h), pages_per_huge_page(h), folio); } if (!memcg_charge_ret) mem_cgroup_commit_charge(folio, memcg); mem_cgroup_put(memcg); return folio; out_uncharge_cgroup: hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg); out_uncharge_cgroup_reservation: if (deferred_reserve) hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h), h_cg); out_subpool_put: if (map_chg || avoid_reserve) hugepage_subpool_put_pages(spool, 1); out_end_reservation: vma_end_reservation(h, vma, addr); if (!memcg_charge_ret) mem_cgroup_cancel_charge(memcg, nr_pages); mem_cgroup_put(memcg); return ERR_PTR(-ENOSPC); } int alloc_bootmem_huge_page(struct hstate *h, int nid) __attribute__ ((weak, alias("__alloc_bootmem_huge_page"))); int __alloc_bootmem_huge_page(struct hstate *h, int nid) { struct huge_bootmem_page *m = NULL; /* initialize for clang */ int nr_nodes, node; /* do node specific alloc */ if (nid != NUMA_NO_NODE) { m = memblock_alloc_try_nid_raw(huge_page_size(h), huge_page_size(h), 0, MEMBLOCK_ALLOC_ACCESSIBLE, nid); if (!m) return 0; goto found; } /* allocate from next node when distributing huge pages */ for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) { m = memblock_alloc_try_nid_raw( huge_page_size(h), huge_page_size(h), 0, MEMBLOCK_ALLOC_ACCESSIBLE, node); /* * Use the beginning of the huge page to store the * huge_bootmem_page struct (until gather_bootmem * puts them into the mem_map). */ if (!m) return 0; goto found; } found: /* * Only initialize the head struct page in memmap_init_reserved_pages, * rest of the struct pages will be initialized by the HugeTLB * subsystem itself. * The head struct page is used to get folio information by the HugeTLB * subsystem like zone id and node id. */ memblock_reserved_mark_noinit(virt_to_phys((void *)m + PAGE_SIZE), huge_page_size(h) - PAGE_SIZE); /* Put them into a private list first because mem_map is not up yet */ INIT_LIST_HEAD(&m->list); list_add(&m->list, &huge_boot_pages); m->hstate = h; return 1; } /* Initialize [start_page:end_page_number] tail struct pages of a hugepage */ static void __init hugetlb_folio_init_tail_vmemmap(struct folio *folio, unsigned long start_page_number, unsigned long end_page_number) { enum zone_type zone = zone_idx(folio_zone(folio)); int nid = folio_nid(folio); unsigned long head_pfn = folio_pfn(folio); unsigned long pfn, end_pfn = head_pfn + end_page_number; int ret; for (pfn = head_pfn + start_page_number; pfn < end_pfn; pfn++) { struct page *page = pfn_to_page(pfn); __init_single_page(page, pfn, zone, nid); prep_compound_tail((struct page *)folio, pfn - head_pfn); ret = page_ref_freeze(page, 1); VM_BUG_ON(!ret); } } static void __init hugetlb_folio_init_vmemmap(struct folio *folio, struct hstate *h, unsigned long nr_pages) { int ret; /* Prepare folio head */ __folio_clear_reserved(folio); __folio_set_head(folio); ret = folio_ref_freeze(folio, 1); VM_BUG_ON(!ret); /* Initialize the necessary tail struct pages */ hugetlb_folio_init_tail_vmemmap(folio, 1, nr_pages); prep_compound_head((struct page *)folio, huge_page_order(h)); } static void __init prep_and_add_bootmem_folios(struct hstate *h, struct list_head *folio_list) { unsigned long flags; struct folio *folio, *tmp_f; /* Send list for bulk vmemmap optimization processing */ hugetlb_vmemmap_optimize_folios(h, folio_list); /* Add all new pool pages to free lists in one lock cycle */ spin_lock_irqsave(&hugetlb_lock, flags); list_for_each_entry_safe(folio, tmp_f, folio_list, lru) { if (!folio_test_hugetlb_vmemmap_optimized(folio)) { /* * If HVO fails, initialize all tail struct pages * We do not worry about potential long lock hold * time as this is early in boot and there should * be no contention. */ hugetlb_folio_init_tail_vmemmap(folio, HUGETLB_VMEMMAP_RESERVE_PAGES, pages_per_huge_page(h)); } __prep_account_new_huge_page(h, folio_nid(folio)); enqueue_hugetlb_folio(h, folio); } spin_unlock_irqrestore(&hugetlb_lock, flags); } /* * Put bootmem huge pages into the standard lists after mem_map is up. * Note: This only applies to gigantic (order > MAX_PAGE_ORDER) pages. */ static void __init gather_bootmem_prealloc(void) { LIST_HEAD(folio_list); struct huge_bootmem_page *m; struct hstate *h = NULL, *prev_h = NULL; list_for_each_entry(m, &huge_boot_pages, list) { struct page *page = virt_to_page(m); struct folio *folio = (void *)page; h = m->hstate; /* * It is possible to have multiple huge page sizes (hstates) * in this list. If so, process each size separately. */ if (h != prev_h && prev_h != NULL) prep_and_add_bootmem_folios(prev_h, &folio_list); prev_h = h; VM_BUG_ON(!hstate_is_gigantic(h)); WARN_ON(folio_ref_count(folio) != 1); hugetlb_folio_init_vmemmap(folio, h, HUGETLB_VMEMMAP_RESERVE_PAGES); init_new_hugetlb_folio(h, folio); list_add(&folio->lru, &folio_list); /* * We need to restore the 'stolen' pages to totalram_pages * in order to fix confusing memory reports from free(1) and * other side-effects, like CommitLimit going negative. */ adjust_managed_page_count(page, pages_per_huge_page(h)); cond_resched(); } prep_and_add_bootmem_folios(h, &folio_list); } static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid) { unsigned long i; char buf[32]; for (i = 0; i < h->max_huge_pages_node[nid]; ++i) { if (hstate_is_gigantic(h)) { if (!alloc_bootmem_huge_page(h, nid)) break; } else { struct folio *folio; gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, &node_states[N_MEMORY], NULL); if (!folio) break; free_huge_folio(folio); /* free it into the hugepage allocator */ } cond_resched(); } if (i == h->max_huge_pages_node[nid]) return; string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); pr_warn("HugeTLB: allocating %u of page size %s failed node%d. Only allocated %lu hugepages.\n", h->max_huge_pages_node[nid], buf, nid, i); h->max_huge_pages -= (h->max_huge_pages_node[nid] - i); h->max_huge_pages_node[nid] = i; } /* * NOTE: this routine is called in different contexts for gigantic and * non-gigantic pages. * - For gigantic pages, this is called early in the boot process and * pages are allocated from memblock allocated or something similar. * Gigantic pages are actually added to pools later with the routine * gather_bootmem_prealloc. * - For non-gigantic pages, this is called later in the boot process after * all of mm is up and functional. Pages are allocated from buddy and * then added to hugetlb pools. */ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) { unsigned long i; struct folio *folio; LIST_HEAD(folio_list); nodemask_t *node_alloc_noretry; bool node_specific_alloc = false; /* skip gigantic hugepages allocation if hugetlb_cma enabled */ if (hstate_is_gigantic(h) && hugetlb_cma_size) { pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n"); return; } /* do node specific alloc */ for_each_online_node(i) { if (h->max_huge_pages_node[i] > 0) { hugetlb_hstate_alloc_pages_onenode(h, i); node_specific_alloc = true; } } if (node_specific_alloc) return; /* below will do all node balanced alloc */ if (!hstate_is_gigantic(h)) { /* * Bit mask controlling how hard we retry per-node allocations. * Ignore errors as lower level routines can deal with * node_alloc_noretry == NULL. If this kmalloc fails at boot * time, we are likely in bigger trouble. */ node_alloc_noretry = kmalloc(sizeof(*node_alloc_noretry), GFP_KERNEL); } else { /* allocations done at boot time */ node_alloc_noretry = NULL; } /* bit mask controlling how hard we retry per-node allocations */ if (node_alloc_noretry) nodes_clear(*node_alloc_noretry); for (i = 0; i < h->max_huge_pages; ++i) { if (hstate_is_gigantic(h)) { /* * gigantic pages not added to list as they are not * added to pools now. */ if (!alloc_bootmem_huge_page(h, NUMA_NO_NODE)) break; } else { folio = alloc_pool_huge_folio(h, &node_states[N_MEMORY], node_alloc_noretry); if (!folio) break; list_add(&folio->lru, &folio_list); } cond_resched(); } /* list will be empty if hstate_is_gigantic */ prep_and_add_allocated_folios(h, &folio_list); if (i < h->max_huge_pages) { char buf[32]; string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); pr_warn("HugeTLB: allocating %lu of page size %s failed. Only allocated %lu hugepages.\n", h->max_huge_pages, buf, i); h->max_huge_pages = i; } kfree(node_alloc_noretry); } static void __init hugetlb_init_hstates(void) { struct hstate *h, *h2; for_each_hstate(h) { /* oversize hugepages were init'ed in early boot */ if (!hstate_is_gigantic(h)) hugetlb_hstate_alloc_pages(h); /* * Set demote order for each hstate. Note that * h->demote_order is initially 0. * - We can not demote gigantic pages if runtime freeing * is not supported, so skip this. * - If CMA allocation is possible, we can not demote * HUGETLB_PAGE_ORDER or smaller size pages. */ if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) continue; if (hugetlb_cma_size && h->order <= HUGETLB_PAGE_ORDER) continue; for_each_hstate(h2) { if (h2 == h) continue; if (h2->order < h->order && h2->order > h->demote_order) h->demote_order = h2->order; } } } static void __init report_hugepages(void) { struct hstate *h; for_each_hstate(h) { char buf[32]; string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); pr_info("HugeTLB: registered %s page size, pre-allocated %ld pages\n", buf, h->free_huge_pages); pr_info("HugeTLB: %d KiB vmemmap can be freed for a %s page\n", hugetlb_vmemmap_optimizable_size(h) / SZ_1K, buf); } } #ifdef CONFIG_HIGHMEM static void try_to_free_low(struct hstate *h, unsigned long count, nodemask_t *nodes_allowed) { int i; LIST_HEAD(page_list); lockdep_assert_held(&hugetlb_lock); if (hstate_is_gigantic(h)) return; /* * Collect pages to be freed on a list, and free after dropping lock */ for_each_node_mask(i, *nodes_allowed) { struct folio *folio, *next; struct list_head *freel = &h->hugepage_freelists[i]; list_for_each_entry_safe(folio, next, freel, lru) { if (count >= h->nr_huge_pages) goto out; if (folio_test_highmem(folio)) continue; remove_hugetlb_folio(h, folio, false); list_add(&folio->lru, &page_list); } } out: spin_unlock_irq(&hugetlb_lock); update_and_free_pages_bulk(h, &page_list); spin_lock_irq(&hugetlb_lock); } #else static inline void try_to_free_low(struct hstate *h, unsigned long count, nodemask_t *nodes_allowed) { } #endif /* * Increment or decrement surplus_huge_pages. Keep node-specific counters * balanced by operating on them in a round-robin fashion. * Returns 1 if an adjustment was made. */ static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed, int delta) { int nr_nodes, node; lockdep_assert_held(&hugetlb_lock); VM_BUG_ON(delta != -1 && delta != 1); if (delta < 0) { for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { if (h->surplus_huge_pages_node[node]) goto found; } } else { for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { if (h->surplus_huge_pages_node[node] < h->nr_huge_pages_node[node]) goto found; } } return 0; found: h->surplus_huge_pages += delta; h->surplus_huge_pages_node[node] += delta; return 1; } #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, nodemask_t *nodes_allowed) { unsigned long min_count; unsigned long allocated; struct folio *folio; LIST_HEAD(page_list); NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL); /* * Bit mask controlling how hard we retry per-node allocations. * If we can not allocate the bit mask, do not attempt to allocate * the requested huge pages. */ if (node_alloc_noretry) nodes_clear(*node_alloc_noretry); else return -ENOMEM; /* * resize_lock mutex prevents concurrent adjustments to number of * pages in hstate via the proc/sysfs interfaces. */ mutex_lock(&h->resize_lock); flush_free_hpage_work(h); spin_lock_irq(&hugetlb_lock); /* * Check for a node specific request. * Changing node specific huge page count may require a corresponding * change to the global count. In any case, the passed node mask * (nodes_allowed) will restrict alloc/free to the specified node. */ if (nid != NUMA_NO_NODE) { unsigned long old_count = count; count += persistent_huge_pages(h) - (h->nr_huge_pages_node[nid] - h->surplus_huge_pages_node[nid]); /* * User may have specified a large count value which caused the * above calculation to overflow. In this case, they wanted * to allocate as many huge pages as possible. Set count to * largest possible value to align with their intention. */ if (count < old_count) count = ULONG_MAX; } /* * Gigantic pages runtime allocation depend on the capability for large * page range allocation. * If the system does not provide this feature, return an error when * the user tries to allocate gigantic pages but let the user free the * boottime allocated gigantic pages. */ if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) { if (count > persistent_huge_pages(h)) { spin_unlock_irq(&hugetlb_lock); mutex_unlock(&h->resize_lock); NODEMASK_FREE(node_alloc_noretry); return -EINVAL; } /* Fall through to decrease pool */ } /* * Increase the pool size * First take pages out of surplus state. Then make up the * remaining difference by allocating fresh huge pages. * * We might race with alloc_surplus_hugetlb_folio() here and be unable * to convert a surplus huge page to a normal huge page. That is * not critical, though, it just means the overall size of the * pool might be one hugepage larger than it needs to be, but * within all the constraints specified by the sysctls. */ while (h->surplus_huge_pages && count > persistent_huge_pages(h)) { if (!adjust_pool_surplus(h, nodes_allowed, -1)) break; } allocated = 0; while (count > (persistent_huge_pages(h) + allocated)) { /* * If this allocation races such that we no longer need the * page, free_huge_folio will handle it by freeing the page * and reducing the surplus. */ spin_unlock_irq(&hugetlb_lock); /* yield cpu to avoid soft lockup */ cond_resched(); folio = alloc_pool_huge_folio(h, nodes_allowed, node_alloc_noretry); if (!folio) { prep_and_add_allocated_folios(h, &page_list); spin_lock_irq(&hugetlb_lock); goto out; } list_add(&folio->lru, &page_list); allocated++; /* Bail for signals. Probably ctrl-c from user */ if (signal_pending(current)) { prep_and_add_allocated_folios(h, &page_list); spin_lock_irq(&hugetlb_lock); goto out; } spin_lock_irq(&hugetlb_lock); } /* Add allocated pages to the pool */ if (!list_empty(&page_list)) { spin_unlock_irq(&hugetlb_lock); prep_and_add_allocated_folios(h, &page_list); spin_lock_irq(&hugetlb_lock); } /* * Decrease the pool size * First return free pages to the buddy allocator (being careful * to keep enough around to satisfy reservations). Then place * pages into surplus state as needed so the pool will shrink * to the desired size as pages become free. * * By placing pages into the surplus state independent of the * overcommit value, we are allowing the surplus pool size to * exceed overcommit. There are few sane options here. Since * alloc_surplus_hugetlb_folio() is checking the global counter, * though, we'll note that we're not allowed to exceed surplus * and won't grow the pool anywhere else. Not until one of the * sysctls are changed, or the surplus pages go out of use. */ min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages; min_count = max(count, min_count); try_to_free_low(h, min_count, nodes_allowed); /* * Collect pages to be removed on list without dropping lock */ while (min_count < persistent_huge_pages(h)) { folio = remove_pool_hugetlb_folio(h, nodes_allowed, 0); if (!folio) break; list_add(&folio->lru, &page_list); } /* free the pages after dropping lock */ spin_unlock_irq(&hugetlb_lock); update_and_free_pages_bulk(h, &page_list); flush_free_hpage_work(h); spin_lock_irq(&hugetlb_lock); while (count < persistent_huge_pages(h)) { if (!adjust_pool_surplus(h, nodes_allowed, 1)) break; } out: h->max_huge_pages = persistent_huge_pages(h); spin_unlock_irq(&hugetlb_lock); mutex_unlock(&h->resize_lock); NODEMASK_FREE(node_alloc_noretry); return 0; } static int demote_free_hugetlb_folio(struct hstate *h, struct folio *folio) { int i, nid = folio_nid(folio); struct hstate *target_hstate; struct page *subpage; struct folio *inner_folio; int rc = 0; target_hstate = size_to_hstate(PAGE_SIZE << h->demote_order); remove_hugetlb_folio_for_demote(h, folio, false); spin_unlock_irq(&hugetlb_lock); /* * If vmemmap already existed for folio, the remove routine above would * have cleared the hugetlb folio flag. Hence the folio is technically * no longer a hugetlb folio. hugetlb_vmemmap_restore_folio can only be * passed hugetlb folios and will BUG otherwise. */ if (folio_test_hugetlb(folio)) { rc = hugetlb_vmemmap_restore_folio(h, folio); if (rc) { /* Allocation of vmemmmap failed, we can not demote folio */ spin_lock_irq(&hugetlb_lock); folio_ref_unfreeze(folio, 1); add_hugetlb_folio(h, folio, false); return rc; } } /* * Use destroy_compound_hugetlb_folio_for_demote for all huge page * sizes as it will not ref count folios. */ destroy_compound_hugetlb_folio_for_demote(folio, huge_page_order(h)); /* * Taking target hstate mutex synchronizes with set_max_huge_pages. * Without the mutex, pages added to target hstate could be marked * as surplus. * * Note that we already hold h->resize_lock. To prevent deadlock, * use the convention of always taking larger size hstate mutex first. */ mutex_lock(&target_hstate->resize_lock); for (i = 0; i < pages_per_huge_page(h); i += pages_per_huge_page(target_hstate)) { subpage = folio_page(folio, i); inner_folio = page_folio(subpage); if (hstate_is_gigantic(target_hstate)) prep_compound_gigantic_folio_for_demote(inner_folio, target_hstate->order); else prep_compound_page(subpage, target_hstate->order); folio_change_private(inner_folio, NULL); prep_new_hugetlb_folio(target_hstate, inner_folio, nid); free_huge_folio(inner_folio); } mutex_unlock(&target_hstate->resize_lock); spin_lock_irq(&hugetlb_lock); /* * Not absolutely necessary, but for consistency update max_huge_pages * based on pool changes for the demoted page. */ h->max_huge_pages--; target_hstate->max_huge_pages += pages_per_huge_page(h) / pages_per_huge_page(target_hstate); return rc; } static int demote_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed) __must_hold(&hugetlb_lock) { int nr_nodes, node; struct folio *folio; lockdep_assert_held(&hugetlb_lock); /* We should never get here if no demote order */ if (!h->demote_order) { pr_warn("HugeTLB: NULL demote order passed to demote_pool_huge_page.\n"); return -EINVAL; /* internal error */ } for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { list_for_each_entry(folio, &h->hugepage_freelists[node], lru) { if (folio_test_hwpoison(folio)) continue; return demote_free_hugetlb_folio(h, folio); } } /* * Only way to get here is if all pages on free lists are poisoned. * Return -EBUSY so that caller will not retry. */ return -EBUSY; } #define HSTATE_ATTR_RO(_name) \ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) #define HSTATE_ATTR_WO(_name) \ static struct kobj_attribute _name##_attr = __ATTR_WO(_name) #define HSTATE_ATTR(_name) \ static struct kobj_attribute _name##_attr = __ATTR_RW(_name) static struct kobject *hugepages_kobj; static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp); static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp) { int i; for (i = 0; i < HUGE_MAX_HSTATE; i++) if (hstate_kobjs[i] == kobj) { if (nidp) *nidp = NUMA_NO_NODE; return &hstates[i]; } return kobj_to_node_hstate(kobj, nidp); } static ssize_t nr_hugepages_show_common(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct hstate *h; unsigned long nr_huge_pages; int nid; h = kobj_to_hstate(kobj, &nid); if (nid == NUMA_NO_NODE) nr_huge_pages = h->nr_huge_pages; else nr_huge_pages = h->nr_huge_pages_node[nid]; return sysfs_emit(buf, "%lu\n", nr_huge_pages); } static ssize_t __nr_hugepages_store_common(bool obey_mempolicy, struct hstate *h, int nid, unsigned long count, size_t len) { int err; nodemask_t nodes_allowed, *n_mask; if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) return -EINVAL; if (nid == NUMA_NO_NODE) { /* * global hstate attribute */ if (!(obey_mempolicy && init_nodemask_of_mempolicy(&nodes_allowed))) n_mask = &node_states[N_MEMORY]; else n_mask = &nodes_allowed; } else { /* * Node specific request. count adjustment happens in * set_max_huge_pages() after acquiring hugetlb_lock. */ init_nodemask_of_node(&nodes_allowed, nid); n_mask = &nodes_allowed; } err = set_max_huge_pages(h, count, nid, n_mask); return err ? err : len; } static ssize_t nr_hugepages_store_common(bool obey_mempolicy, struct kobject *kobj, const char *buf, size_t len) { struct hstate *h; unsigned long count; int nid; int err; err = kstrtoul(buf, 10, &count); if (err) return err; h = kobj_to_hstate(kobj, &nid); return __nr_hugepages_store_common(obey_mempolicy, h, nid, count, len); } static ssize_t nr_hugepages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return nr_hugepages_show_common(kobj, attr, buf); } static ssize_t nr_hugepages_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t len) { return nr_hugepages_store_common(false, kobj, buf, len); } HSTATE_ATTR(nr_hugepages); #ifdef CONFIG_NUMA /* * hstate attribute for optionally mempolicy-based constraint on persistent * huge page alloc/free. */ static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return nr_hugepages_show_common(kobj, attr, buf); } static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t len) { return nr_hugepages_store_common(true, kobj, buf, len); } HSTATE_ATTR(nr_hugepages_mempolicy); #endif static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct hstate *h = kobj_to_hstate(kobj, NULL); return sysfs_emit(buf, "%lu\n", h->nr_overcommit_huge_pages); } static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; unsigned long input; struct hstate *h = kobj_to_hstate(kobj, NULL); if (hstate_is_gigantic(h)) return -EINVAL; err = kstrtoul(buf, 10, &input); if (err) return err; spin_lock_irq(&hugetlb_lock); h->nr_overcommit_huge_pages = input; spin_unlock_irq(&hugetlb_lock); return count; } HSTATE_ATTR(nr_overcommit_hugepages); static ssize_t free_hugepages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct hstate *h; unsigned long free_huge_pages; int nid; h = kobj_to_hstate(kobj, &nid); if (nid == NUMA_NO_NODE) free_huge_pages = h->free_huge_pages; else free_huge_pages = h->free_huge_pages_node[nid]; return sysfs_emit(buf, "%lu\n", free_huge_pages); } HSTATE_ATTR_RO(free_hugepages); static ssize_t resv_hugepages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct hstate *h = kobj_to_hstate(kobj, NULL); return sysfs_emit(buf, "%lu\n", h->resv_huge_pages); } HSTATE_ATTR_RO(resv_hugepages); static ssize_t surplus_hugepages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct hstate *h; unsigned long surplus_huge_pages; int nid; h = kobj_to_hstate(kobj, &nid); if (nid == NUMA_NO_NODE) surplus_huge_pages = h->surplus_huge_pages; else surplus_huge_pages = h->surplus_huge_pages_node[nid]; return sysfs_emit(buf, "%lu\n", surplus_huge_pages); } HSTATE_ATTR_RO(surplus_hugepages); static ssize_t demote_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t len) { unsigned long nr_demote; unsigned long nr_available; nodemask_t nodes_allowed, *n_mask; struct hstate *h; int err; int nid; err = kstrtoul(buf, 10, &nr_demote); if (err) return err; h = kobj_to_hstate(kobj, &nid); if (nid != NUMA_NO_NODE) { init_nodemask_of_node(&nodes_allowed, nid); n_mask = &nodes_allowed; } else { n_mask = &node_states[N_MEMORY]; } /* Synchronize with other sysfs operations modifying huge pages */ mutex_lock(&h->resize_lock); spin_lock_irq(&hugetlb_lock); while (nr_demote) { /* * Check for available pages to demote each time thorough the * loop as demote_pool_huge_page will drop hugetlb_lock. */ if (nid != NUMA_NO_NODE) nr_available = h->free_huge_pages_node[nid]; else nr_available = h->free_huge_pages; nr_available -= h->resv_huge_pages; if (!nr_available) break; err = demote_pool_huge_page(h, n_mask); if (err) break; nr_demote--; } spin_unlock_irq(&hugetlb_lock); mutex_unlock(&h->resize_lock); if (err) return err; return len; } HSTATE_ATTR_WO(demote); static ssize_t demote_size_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct hstate *h = kobj_to_hstate(kobj, NULL); unsigned long demote_size = (PAGE_SIZE << h->demote_order) / SZ_1K; return sysfs_emit(buf, "%lukB\n", demote_size); } static ssize_t demote_size_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { struct hstate *h, *demote_hstate; unsigned long demote_size; unsigned int demote_order; demote_size = (unsigned long)memparse(buf, NULL); demote_hstate = size_to_hstate(demote_size); if (!demote_hstate) return -EINVAL; demote_order = demote_hstate->order; if (demote_order < HUGETLB_PAGE_ORDER) return -EINVAL; /* demote order must be smaller than hstate order */ h = kobj_to_hstate(kobj, NULL); if (demote_order >= h->order) return -EINVAL; /* resize_lock synchronizes access to demote size and writes */ mutex_lock(&h->resize_lock); h->demote_order = demote_order; mutex_unlock(&h->resize_lock); return count; } HSTATE_ATTR(demote_size); static struct attribute *hstate_attrs[] = { &nr_hugepages_attr.attr, &nr_overcommit_hugepages_attr.attr, &free_hugepages_attr.attr, &resv_hugepages_attr.attr, &surplus_hugepages_attr.attr, #ifdef CONFIG_NUMA &nr_hugepages_mempolicy_attr.attr, #endif NULL, }; static const struct attribute_group hstate_attr_group = { .attrs = hstate_attrs, }; static struct attribute *hstate_demote_attrs[] = { &demote_size_attr.attr, &demote_attr.attr, NULL, }; static const struct attribute_group hstate_demote_attr_group = { .attrs = hstate_demote_attrs, }; static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent, struct kobject **hstate_kobjs, const struct attribute_group *hstate_attr_group) { int retval; int hi = hstate_index(h); hstate_kobjs[hi] = kobject_create_and_add(h->name, parent); if (!hstate_kobjs[hi]) return -ENOMEM; retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group); if (retval) { kobject_put(hstate_kobjs[hi]); hstate_kobjs[hi] = NULL; return retval; } if (h->demote_order) { retval = sysfs_create_group(hstate_kobjs[hi], &hstate_demote_attr_group); if (retval) { pr_warn("HugeTLB unable to create demote interfaces for %s\n", h->name); sysfs_remove_group(hstate_kobjs[hi], hstate_attr_group); kobject_put(hstate_kobjs[hi]); hstate_kobjs[hi] = NULL; return retval; } } return 0; } #ifdef CONFIG_NUMA static bool hugetlb_sysfs_initialized __ro_after_init; /* * node_hstate/s - associate per node hstate attributes, via their kobjects, * with node devices in node_devices[] using a parallel array. The array * index of a node device or _hstate == node id. * This is here to avoid any static dependency of the node device driver, in * the base kernel, on the hugetlb module. */ struct node_hstate { struct kobject *hugepages_kobj; struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; }; static struct node_hstate node_hstates[MAX_NUMNODES]; /* * A subset of global hstate attributes for node devices */ static struct attribute *per_node_hstate_attrs[] = { &nr_hugepages_attr.attr, &free_hugepages_attr.attr, &surplus_hugepages_attr.attr, NULL, }; static const struct attribute_group per_node_hstate_attr_group = { .attrs = per_node_hstate_attrs, }; /* * kobj_to_node_hstate - lookup global hstate for node device hstate attr kobj. * Returns node id via non-NULL nidp. */ static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) { int nid; for (nid = 0; nid < nr_node_ids; nid++) { struct node_hstate *nhs = &node_hstates[nid]; int i; for (i = 0; i < HUGE_MAX_HSTATE; i++) if (nhs->hstate_kobjs[i] == kobj) { if (nidp) *nidp = nid; return &hstates[i]; } } BUG(); return NULL; } /* * Unregister hstate attributes from a single node device. * No-op if no hstate attributes attached. */ void hugetlb_unregister_node(struct node *node) { struct hstate *h; struct node_hstate *nhs = &node_hstates[node->dev.id]; if (!nhs->hugepages_kobj) return; /* no hstate attributes */ for_each_hstate(h) { int idx = hstate_index(h); struct kobject *hstate_kobj = nhs->hstate_kobjs[idx]; if (!hstate_kobj) continue; if (h->demote_order) sysfs_remove_group(hstate_kobj, &hstate_demote_attr_group); sysfs_remove_group(hstate_kobj, &per_node_hstate_attr_group); kobject_put(hstate_kobj); nhs->hstate_kobjs[idx] = NULL; } kobject_put(nhs->hugepages_kobj); nhs->hugepages_kobj = NULL; } /* * Register hstate attributes for a single node device. * No-op if attributes already registered. */ void hugetlb_register_node(struct node *node) { struct hstate *h; struct node_hstate *nhs = &node_hstates[node->dev.id]; int err; if (!hugetlb_sysfs_initialized) return; if (nhs->hugepages_kobj) return; /* already allocated */ nhs->hugepages_kobj = kobject_create_and_add("hugepages", &node->dev.kobj); if (!nhs->hugepages_kobj) return; for_each_hstate(h) { err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj, nhs->hstate_kobjs, &per_node_hstate_attr_group); if (err) { pr_err("HugeTLB: Unable to add hstate %s for node %d\n", h->name, node->dev.id); hugetlb_unregister_node(node); break; } } } /* * hugetlb init time: register hstate attributes for all registered node * devices of nodes that have memory. All on-line nodes should have * registered their associated device by this time. */ static void __init hugetlb_register_all_nodes(void) { int nid; for_each_online_node(nid) hugetlb_register_node(node_devices[nid]); } #else /* !CONFIG_NUMA */ static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) { BUG(); if (nidp) *nidp = -1; return NULL; } static void hugetlb_register_all_nodes(void) { } #endif #ifdef CONFIG_CMA static void __init hugetlb_cma_check(void); #else static inline __init void hugetlb_cma_check(void) { } #endif static void __init hugetlb_sysfs_init(void) { struct hstate *h; int err; hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj); if (!hugepages_kobj) return; for_each_hstate(h) { err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, hstate_kobjs, &hstate_attr_group); if (err) pr_err("HugeTLB: Unable to add hstate %s", h->name); } #ifdef CONFIG_NUMA hugetlb_sysfs_initialized = true; #endif hugetlb_register_all_nodes(); } #ifdef CONFIG_SYSCTL static void hugetlb_sysctl_init(void); #else static inline void hugetlb_sysctl_init(void) { } #endif static int __init hugetlb_init(void) { int i; BUILD_BUG_ON(sizeof_field(struct page, private) * BITS_PER_BYTE < __NR_HPAGEFLAGS); if (!hugepages_supported()) { if (hugetlb_max_hstate || default_hstate_max_huge_pages) pr_warn("HugeTLB: huge pages not supported, ignoring associated command-line parameters\n"); return 0; } /* * Make sure HPAGE_SIZE (HUGETLB_PAGE_ORDER) hstate exists. Some * architectures depend on setup being done here. */ hugetlb_add_hstate(HUGETLB_PAGE_ORDER); if (!parsed_default_hugepagesz) { /* * If we did not parse a default huge page size, set * default_hstate_idx to HPAGE_SIZE hstate. And, if the * number of huge pages for this default size was implicitly * specified, set that here as well. * Note that the implicit setting will overwrite an explicit * setting. A warning will be printed in this case. */ default_hstate_idx = hstate_index(size_to_hstate(HPAGE_SIZE)); if (default_hstate_max_huge_pages) { if (default_hstate.max_huge_pages) { char buf[32]; string_get_size(huge_page_size(&default_hstate), 1, STRING_UNITS_2, buf, 32); pr_warn("HugeTLB: Ignoring hugepages=%lu associated with %s page size\n", default_hstate.max_huge_pages, buf); pr_warn("HugeTLB: Using hugepages=%lu for number of default huge pages\n", default_hstate_max_huge_pages); } default_hstate.max_huge_pages = default_hstate_max_huge_pages; for_each_online_node(i) default_hstate.max_huge_pages_node[i] = default_hugepages_in_node[i]; } } hugetlb_cma_check(); hugetlb_init_hstates(); gather_bootmem_prealloc(); report_hugepages(); hugetlb_sysfs_init(); hugetlb_cgroup_file_init(); hugetlb_sysctl_init(); #ifdef CONFIG_SMP num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus()); #else num_fault_mutexes = 1; #endif hugetlb_fault_mutex_table = kmalloc_array(num_fault_mutexes, sizeof(struct mutex), GFP_KERNEL); BUG_ON(!hugetlb_fault_mutex_table); for (i = 0; i < num_fault_mutexes; i++) mutex_init(&hugetlb_fault_mutex_table[i]); return 0; } subsys_initcall(hugetlb_init); /* Overwritten by architectures with more huge page sizes */ bool __init __attribute((weak)) arch_hugetlb_valid_size(unsigned long size) { return size == HPAGE_SIZE; } void __init hugetlb_add_hstate(unsigned int order) { struct hstate *h; unsigned long i; if (size_to_hstate(PAGE_SIZE << order)) { return; } BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE); BUG_ON(order < order_base_2(__NR_USED_SUBPAGE)); h = &hstates[hugetlb_max_hstate++]; mutex_init(&h->resize_lock); h->order = order; h->mask = ~(huge_page_size(h) - 1); for (i = 0; i < MAX_NUMNODES; ++i) INIT_LIST_HEAD(&h->hugepage_freelists[i]); INIT_LIST_HEAD(&h->hugepage_activelist); h->next_nid_to_alloc = first_memory_node; h->next_nid_to_free = first_memory_node; snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", huge_page_size(h)/SZ_1K); parsed_hstate = h; } bool __init __weak hugetlb_node_alloc_supported(void) { return true; } static void __init hugepages_clear_pages_in_node(void) { if (!hugetlb_max_hstate) { default_hstate_max_huge_pages = 0; memset(default_hugepages_in_node, 0, sizeof(default_hugepages_in_node)); } else { parsed_hstate->max_huge_pages = 0; memset(parsed_hstate->max_huge_pages_node, 0, sizeof(parsed_hstate->max_huge_pages_node)); } } /* * hugepages command line processing * hugepages normally follows a valid hugepagsz or default_hugepagsz * specification. If not, ignore the hugepages value. hugepages can also * be the first huge page command line option in which case it implicitly * specifies the number of huge pages for the default size. */ static int __init hugepages_setup(char *s) { unsigned long *mhp; static unsigned long *last_mhp; int node = NUMA_NO_NODE; int count; unsigned long tmp; char *p = s; if (!parsed_valid_hugepagesz) { pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s); parsed_valid_hugepagesz = true; return 1; } /* * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter * yet, so this hugepages= parameter goes to the "default hstate". * Otherwise, it goes with the previously parsed hugepagesz or * default_hugepagesz. */ else if (!hugetlb_max_hstate) mhp = &default_hstate_max_huge_pages; else mhp = &parsed_hstate->max_huge_pages; if (mhp == last_mhp) { pr_warn("HugeTLB: hugepages= specified twice without interleaving hugepagesz=, ignoring hugepages=%s\n", s); return 1; } while (*p) { count = 0; if (sscanf(p, "%lu%n", &tmp, &count) != 1) goto invalid; /* Parameter is node format */ if (p[count] == ':') { if (!hugetlb_node_alloc_supported()) { pr_warn("HugeTLB: architecture can't support node specific alloc, ignoring!\n"); return 1; } if (tmp >= MAX_NUMNODES || !node_online(tmp)) goto invalid; node = array_index_nospec(tmp, MAX_NUMNODES); p += count + 1; /* Parse hugepages */ if (sscanf(p, "%lu%n", &tmp, &count) != 1) goto invalid; if (!hugetlb_max_hstate) default_hugepages_in_node[node] = tmp; else parsed_hstate->max_huge_pages_node[node] = tmp; *mhp += tmp; /* Go to parse next node*/ if (p[count] == ',') p += count + 1; else break; } else { if (p != s) goto invalid; *mhp = tmp; break; } } /* * Global state is always initialized later in hugetlb_init. * But we need to allocate gigantic hstates here early to still * use the bootmem allocator. */ if (hugetlb_max_hstate && hstate_is_gigantic(parsed_hstate)) hugetlb_hstate_alloc_pages(parsed_hstate); last_mhp = mhp; return 1; invalid: pr_warn("HugeTLB: Invalid hugepages parameter %s\n", p); hugepages_clear_pages_in_node(); return 1; } __setup("hugepages=", hugepages_setup); /* * hugepagesz command line processing * A specific huge page size can only be specified once with hugepagesz. * hugepagesz is followed by hugepages on the command line. The global * variable 'parsed_valid_hugepagesz' is used to determine if prior * hugepagesz argument was valid. */ static int __init hugepagesz_setup(char *s) { unsigned long size; struct hstate *h; parsed_valid_hugepagesz = false; size = (unsigned long)memparse(s, NULL); if (!arch_hugetlb_valid_size(size)) { pr_err("HugeTLB: unsupported hugepagesz=%s\n", s); return 1; } h = size_to_hstate(size); if (h) { /* * hstate for this size already exists. This is normally * an error, but is allowed if the existing hstate is the * default hstate. More specifically, it is only allowed if * the number of huge pages for the default hstate was not * previously specified. */ if (!parsed_default_hugepagesz || h != &default_hstate || default_hstate.max_huge_pages) { pr_warn("HugeTLB: hugepagesz=%s specified twice, ignoring\n", s); return 1; } /* * No need to call hugetlb_add_hstate() as hstate already * exists. But, do set parsed_hstate so that a following * hugepages= parameter will be applied to this hstate. */ parsed_hstate = h; parsed_valid_hugepagesz = true; return 1; } hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT); parsed_valid_hugepagesz = true; return 1; } __setup("hugepagesz=", hugepagesz_setup); /* * default_hugepagesz command line input * Only one instance of default_hugepagesz allowed on command line. */ static int __init default_hugepagesz_setup(char *s) { unsigned long size; int i; parsed_valid_hugepagesz = false; if (parsed_default_hugepagesz) { pr_err("HugeTLB: default_hugepagesz previously specified, ignoring %s\n", s); return 1; } size = (unsigned long)memparse(s, NULL); if (!arch_hugetlb_valid_size(size)) { pr_err("HugeTLB: unsupported default_hugepagesz=%s\n", s); return 1; } hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT); parsed_valid_hugepagesz = true; parsed_default_hugepagesz = true; default_hstate_idx = hstate_index(size_to_hstate(size)); /* * The number of default huge pages (for this size) could have been * specified as the first hugetlb parameter: hugepages=X. If so, * then default_hstate_max_huge_pages is set. If the default huge * page size is gigantic (> MAX_PAGE_ORDER), then the pages must be * allocated here from bootmem allocator. */ if (default_hstate_max_huge_pages) { default_hstate.max_huge_pages = default_hstate_max_huge_pages; for_each_online_node(i) default_hstate.max_huge_pages_node[i] = default_hugepages_in_node[i]; if (hstate_is_gigantic(&default_hstate)) hugetlb_hstate_alloc_pages(&default_hstate); default_hstate_max_huge_pages = 0; } return 1; } __setup("default_hugepagesz=", default_hugepagesz_setup); static nodemask_t *policy_mbind_nodemask(gfp_t gfp) { #ifdef CONFIG_NUMA struct mempolicy *mpol = get_task_policy(current); /* * Only enforce MPOL_BIND policy which overlaps with cpuset policy * (from policy_nodemask) specifically for hugetlb case */ if (mpol->mode == MPOL_BIND && (apply_policy_zone(mpol, gfp_zone(gfp)) && cpuset_nodemask_valid_mems_allowed(&mpol->nodes))) return &mpol->nodes; #endif return NULL; } static unsigned int allowed_mems_nr(struct hstate *h) { int node; unsigned int nr = 0; nodemask_t *mbind_nodemask; unsigned int *array = h->free_huge_pages_node; gfp_t gfp_mask = htlb_alloc_mask(h); mbind_nodemask = policy_mbind_nodemask(gfp_mask); for_each_node_mask(node, cpuset_current_mems_allowed) { if (!mbind_nodemask || node_isset(node, *mbind_nodemask)) nr += array[node]; } return nr; } #ifdef CONFIG_SYSCTL static int proc_hugetlb_doulongvec_minmax(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos, unsigned long *out) { struct ctl_table dup_table; /* * In order to avoid races with __do_proc_doulongvec_minmax(), we * can duplicate the @table and alter the duplicate of it. */ dup_table = *table; dup_table.data = out; return proc_doulongvec_minmax(&dup_table, write, buffer, length, ppos); } static int hugetlb_sysctl_handler_common(bool obey_mempolicy, struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { struct hstate *h = &default_hstate; unsigned long tmp = h->max_huge_pages; int ret; if (!hugepages_supported()) return -EOPNOTSUPP; ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos, &tmp); if (ret) goto out; if (write) ret = __nr_hugepages_store_common(obey_mempolicy, h, NUMA_NO_NODE, tmp, *length); out: return ret; } static int hugetlb_sysctl_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { return hugetlb_sysctl_handler_common(false, table, write, buffer, length, ppos); } #ifdef CONFIG_NUMA static int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { return hugetlb_sysctl_handler_common(true, table, write, buffer, length, ppos); } #endif /* CONFIG_NUMA */ static int hugetlb_overcommit_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { struct hstate *h = &default_hstate; unsigned long tmp; int ret; if (!hugepages_supported()) return -EOPNOTSUPP; tmp = h->nr_overcommit_huge_pages; if (write && hstate_is_gigantic(h)) return -EINVAL; ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos, &tmp); if (ret) goto out; if (write) { spin_lock_irq(&hugetlb_lock); h->nr_overcommit_huge_pages = tmp; spin_unlock_irq(&hugetlb_lock); } out: return ret; } static struct ctl_table hugetlb_table[] = { { .procname = "nr_hugepages", .data = NULL, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = hugetlb_sysctl_handler, }, #ifdef CONFIG_NUMA { .procname = "nr_hugepages_mempolicy", .data = NULL, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = &hugetlb_mempolicy_sysctl_handler, }, #endif { .procname = "hugetlb_shm_group", .data = &sysctl_hugetlb_shm_group, .maxlen = sizeof(gid_t), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "nr_overcommit_hugepages", .data = NULL, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = hugetlb_overcommit_handler, }, { } }; static void hugetlb_sysctl_init(void) { register_sysctl_init("vm", hugetlb_table); } #endif /* CONFIG_SYSCTL */ void hugetlb_report_meminfo(struct seq_file *m) { struct hstate *h; unsigned long total = 0; if (!hugepages_supported()) return; for_each_hstate(h) { unsigned long count = h->nr_huge_pages; total += huge_page_size(h) * count; if (h == &default_hstate) seq_printf(m, "HugePages_Total: %5lu\n" "HugePages_Free: %5lu\n" "HugePages_Rsvd: %5lu\n" "HugePages_Surp: %5lu\n" "Hugepagesize: %8lu kB\n", count, h->free_huge_pages, h->resv_huge_pages, h->surplus_huge_pages, huge_page_size(h) / SZ_1K); } seq_printf(m, "Hugetlb: %8lu kB\n", total / SZ_1K); } int hugetlb_report_node_meminfo(char *buf, int len, int nid) { struct hstate *h = &default_hstate; if (!hugepages_supported()) return 0; return sysfs_emit_at(buf, len, "Node %d HugePages_Total: %5u\n" "Node %d HugePages_Free: %5u\n" "Node %d HugePages_Surp: %5u\n", nid, h->nr_huge_pages_node[nid], nid, h->free_huge_pages_node[nid], nid, h->surplus_huge_pages_node[nid]); } void hugetlb_show_meminfo_node(int nid) { struct hstate *h; if (!hugepages_supported()) return; for_each_hstate(h) printk("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n", nid, h->nr_huge_pages_node[nid], h->free_huge_pages_node[nid], h->surplus_huge_pages_node[nid], huge_page_size(h) / SZ_1K); } void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm) { seq_printf(m, "HugetlbPages:\t%8lu kB\n", K(atomic_long_read(&mm->hugetlb_usage))); } /* Return the number pages of memory we physically have, in PAGE_SIZE units. */ unsigned long hugetlb_total_pages(void) { struct hstate *h; unsigned long nr_total_pages = 0; for_each_hstate(h) nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h); return nr_total_pages; } static int hugetlb_acct_memory(struct hstate *h, long delta) { int ret = -ENOMEM; if (!delta) return 0; spin_lock_irq(&hugetlb_lock); /* * When cpuset is configured, it breaks the strict hugetlb page * reservation as the accounting is done on a global variable. Such * reservation is completely rubbish in the presence of cpuset because * the reservation is not checked against page availability for the * current cpuset. Application can still potentially OOM'ed by kernel * with lack of free htlb page in cpuset that the task is in. * Attempt to enforce strict accounting with cpuset is almost * impossible (or too ugly) because cpuset is too fluid that * task or memory node can be dynamically moved between cpusets. * * The change of semantics for shared hugetlb mapping with cpuset is * undesirable. However, in order to preserve some of the semantics, * we fall back to check against current free page availability as * a best attempt and hopefully to minimize the impact of changing * semantics that cpuset has. * * Apart from cpuset, we also have memory policy mechanism that * also determines from which node the kernel will allocate memory * in a NUMA system. So similar to cpuset, we also should consider * the memory policy of the current task. Similar to the description * above. */ if (delta > 0) { if (gather_surplus_pages(h, delta) < 0) goto out; if (delta > allowed_mems_nr(h)) { return_unused_surplus_pages(h, delta); goto out; } } ret = 0; if (delta < 0) return_unused_surplus_pages(h, (unsigned long) -delta); out: spin_unlock_irq(&hugetlb_lock); return ret; } static void hugetlb_vm_op_open(struct vm_area_struct *vma) { struct resv_map *resv = vma_resv_map(vma); /* * HPAGE_RESV_OWNER indicates a private mapping. * This new VMA should share its siblings reservation map if present. * The VMA will only ever have a valid reservation map pointer where * it is being copied for another still existing VMA. As that VMA * has a reference to the reservation map it cannot disappear until * after this open call completes. It is therefore safe to take a * new reference here without additional locking. */ if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { resv_map_dup_hugetlb_cgroup_uncharge_info(resv); kref_get(&resv->refs); } /* * vma_lock structure for sharable mappings is vma specific. * Clear old pointer (if copied via vm_area_dup) and allocate * new structure. Before clearing, make sure vma_lock is not * for this vma. */ if (vma->vm_flags & VM_MAYSHARE) { struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; if (vma_lock) { if (vma_lock->vma != vma) { vma->vm_private_data = NULL; hugetlb_vma_lock_alloc(vma); } else pr_warn("HugeTLB: vma_lock already exists in %s.\n", __func__); } else hugetlb_vma_lock_alloc(vma); } } static void hugetlb_vm_op_close(struct vm_area_struct *vma) { struct hstate *h = hstate_vma(vma); struct resv_map *resv; struct hugepage_subpool *spool = subpool_vma(vma); unsigned long reserve, start, end; long gbl_reserve; hugetlb_vma_lock_free(vma); resv = vma_resv_map(vma); if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER)) return; start = vma_hugecache_offset(h, vma, vma->vm_start); end = vma_hugecache_offset(h, vma, vma->vm_end); reserve = (end - start) - region_count(resv, start, end); hugetlb_cgroup_uncharge_counter(resv, start, end); if (reserve) { /* * Decrement reserve counts. The global reserve count may be * adjusted if the subpool has a minimum size. */ gbl_reserve = hugepage_subpool_put_pages(spool, reserve); hugetlb_acct_memory(h, -gbl_reserve); } kref_put(&resv->refs, resv_map_release); } static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr) { if (addr & ~(huge_page_mask(hstate_vma(vma)))) return -EINVAL; /* * PMD sharing is only possible for PUD_SIZE-aligned address ranges * in HugeTLB VMAs. If we will lose PUD_SIZE alignment due to this * split, unshare PMDs in the PUD_SIZE interval surrounding addr now. */ if (addr & ~PUD_MASK) { /* * hugetlb_vm_op_split is called right before we attempt to * split the VMA. We will need to unshare PMDs in the old and * new VMAs, so let's unshare before we split. */ unsigned long floor = addr & PUD_MASK; unsigned long ceil = floor + PUD_SIZE; if (floor >= vma->vm_start && ceil <= vma->vm_end) hugetlb_unshare_pmds(vma, floor, ceil); } return 0; } static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma) { return huge_page_size(hstate_vma(vma)); } /* * We cannot handle pagefaults against hugetlb pages at all. They cause * handle_mm_fault() to try to instantiate regular-sized pages in the * hugepage VMA. do_page_fault() is supposed to trap this, so BUG is we get * this far. */ static vm_fault_t hugetlb_vm_op_fault(struct vm_fault *vmf) { BUG(); return 0; } /* * When a new function is introduced to vm_operations_struct and added * to hugetlb_vm_ops, please consider adding the function to shm_vm_ops. * This is because under System V memory model, mappings created via * shmget/shmat with "huge page" specified are backed by hugetlbfs files, * their original vm_ops are overwritten with shm_vm_ops. */ const struct vm_operations_struct hugetlb_vm_ops = { .fault = hugetlb_vm_op_fault, .open = hugetlb_vm_op_open, .close = hugetlb_vm_op_close, .may_split = hugetlb_vm_op_split, .pagesize = hugetlb_vm_op_pagesize, }; static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, int writable) { pte_t entry; unsigned int shift = huge_page_shift(hstate_vma(vma)); if (writable) { entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page, vma->vm_page_prot))); } else { entry = huge_pte_wrprotect(mk_huge_pte(page, vma->vm_page_prot)); } entry = pte_mkyoung(entry); entry = arch_make_huge_pte(entry, shift, vma->vm_flags); return entry; } static void set_huge_ptep_writable(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { pte_t entry; entry = huge_pte_mkwrite(huge_pte_mkdirty(huge_ptep_get(ptep))); if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) update_mmu_cache(vma, address, ptep); } bool is_hugetlb_entry_migration(pte_t pte) { swp_entry_t swp; if (huge_pte_none(pte) || pte_present(pte)) return false; swp = pte_to_swp_entry(pte); if (is_migration_entry(swp)) return true; else return false; } bool is_hugetlb_entry_hwpoisoned(pte_t pte) { swp_entry_t swp; if (huge_pte_none(pte) || pte_present(pte)) return false; swp = pte_to_swp_entry(pte); if (is_hwpoison_entry(swp)) return true; else return false; } static void hugetlb_install_folio(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr, struct folio *new_folio, pte_t old, unsigned long sz) { pte_t newpte = make_huge_pte(vma, &new_folio->page, 1); __folio_mark_uptodate(new_folio); hugetlb_add_new_anon_rmap(new_folio, vma, addr); if (userfaultfd_wp(vma) && huge_pte_uffd_wp(old)) newpte = huge_pte_mkuffd_wp(newpte); set_huge_pte_at(vma->vm_mm, addr, ptep, newpte, sz); hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm); folio_set_hugetlb_migratable(new_folio); } int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) { pte_t *src_pte, *dst_pte, entry; struct folio *pte_folio; unsigned long addr; bool cow = is_cow_mapping(src_vma->vm_flags); struct hstate *h = hstate_vma(src_vma); unsigned long sz = huge_page_size(h); unsigned long npages = pages_per_huge_page(h); struct mmu_notifier_range range; unsigned long last_addr_mask; int ret = 0; if (cow) { mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, src, src_vma->vm_start, src_vma->vm_end); mmu_notifier_invalidate_range_start(&range); vma_assert_write_locked(src_vma); raw_write_seqcount_begin(&src->write_protect_seq); } else { /* * For shared mappings the vma lock must be held before * calling hugetlb_walk() in the src vma. Otherwise, the * returned ptep could go away if part of a shared pmd and * another thread calls huge_pmd_unshare. */ hugetlb_vma_lock_read(src_vma); } last_addr_mask = hugetlb_mask_last_page(h); for (addr = src_vma->vm_start; addr < src_vma->vm_end; addr += sz) { spinlock_t *src_ptl, *dst_ptl; src_pte = hugetlb_walk(src_vma, addr, sz); if (!src_pte) { addr |= last_addr_mask; continue; } dst_pte = huge_pte_alloc(dst, dst_vma, addr, sz); if (!dst_pte) { ret = -ENOMEM; break; } /* * If the pagetables are shared don't copy or take references. * * dst_pte == src_pte is the common case of src/dest sharing. * However, src could have 'unshared' and dst shares with * another vma. So page_count of ptep page is checked instead * to reliably determine whether pte is shared. */ if (page_count(virt_to_page(dst_pte)) > 1) { addr |= last_addr_mask; continue; } dst_ptl = huge_pte_lock(h, dst, dst_pte); src_ptl = huge_pte_lockptr(h, src, src_pte); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); entry = huge_ptep_get(src_pte); again: if (huge_pte_none(entry)) { /* * Skip if src entry none. */ ; } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) { if (!userfaultfd_wp(dst_vma)) entry = huge_pte_clear_uffd_wp(entry); set_huge_pte_at(dst, addr, dst_pte, entry, sz); } else if (unlikely(is_hugetlb_entry_migration(entry))) { swp_entry_t swp_entry = pte_to_swp_entry(entry); bool uffd_wp = pte_swp_uffd_wp(entry); if (!is_readable_migration_entry(swp_entry) && cow) { /* * COW mappings require pages in both * parent and child to be set to read. */ swp_entry = make_readable_migration_entry( swp_offset(swp_entry)); entry = swp_entry_to_pte(swp_entry); if (userfaultfd_wp(src_vma) && uffd_wp) entry = pte_swp_mkuffd_wp(entry); set_huge_pte_at(src, addr, src_pte, entry, sz); } if (!userfaultfd_wp(dst_vma)) entry = huge_pte_clear_uffd_wp(entry); set_huge_pte_at(dst, addr, dst_pte, entry, sz); } else if (unlikely(is_pte_marker(entry))) { pte_marker marker = copy_pte_marker( pte_to_swp_entry(entry), dst_vma); if (marker) set_huge_pte_at(dst, addr, dst_pte, make_pte_marker(marker), sz); } else { entry = huge_ptep_get(src_pte); pte_folio = page_folio(pte_page(entry)); folio_get(pte_folio); /* * Failing to duplicate the anon rmap is a rare case * where we see pinned hugetlb pages while they're * prone to COW. We need to do the COW earlier during * fork. * * When pre-allocating the page or copying data, we * need to be without the pgtable locks since we could * sleep during the process. */ if (!folio_test_anon(pte_folio)) { hugetlb_add_file_rmap(pte_folio); } else if (hugetlb_try_dup_anon_rmap(pte_folio, src_vma)) { pte_t src_pte_old = entry; struct folio *new_folio; spin_unlock(src_ptl); spin_unlock(dst_ptl); /* Do not use reserve as it's private owned */ new_folio = alloc_hugetlb_folio(dst_vma, addr, 1); if (IS_ERR(new_folio)) { folio_put(pte_folio); ret = PTR_ERR(new_folio); break; } ret = copy_user_large_folio(new_folio, pte_folio, addr, dst_vma); folio_put(pte_folio); if (ret) { folio_put(new_folio); break; } /* Install the new hugetlb folio if src pte stable */ dst_ptl = huge_pte_lock(h, dst, dst_pte); src_ptl = huge_pte_lockptr(h, src, src_pte); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); entry = huge_ptep_get(src_pte); if (!pte_same(src_pte_old, entry)) { restore_reserve_on_error(h, dst_vma, addr, new_folio); folio_put(new_folio); /* huge_ptep of dst_pte won't change as in child */ goto again; } hugetlb_install_folio(dst_vma, dst_pte, addr, new_folio, src_pte_old, sz); spin_unlock(src_ptl); spin_unlock(dst_ptl); continue; } if (cow) { /* * No need to notify as we are downgrading page * table protection not changing it to point * to a new page. * * See Documentation/mm/mmu_notifier.rst */ huge_ptep_set_wrprotect(src, addr, src_pte); entry = huge_pte_wrprotect(entry); } if (!userfaultfd_wp(dst_vma)) entry = huge_pte_clear_uffd_wp(entry); set_huge_pte_at(dst, addr, dst_pte, entry, sz); hugetlb_count_add(npages, dst); } spin_unlock(src_ptl); spin_unlock(dst_ptl); } if (cow) { raw_write_seqcount_end(&src->write_protect_seq); mmu_notifier_invalidate_range_end(&range); } else { hugetlb_vma_unlock_read(src_vma); } return ret; } static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr, unsigned long new_addr, pte_t *src_pte, pte_t *dst_pte, unsigned long sz) { struct hstate *h = hstate_vma(vma); struct mm_struct *mm = vma->vm_mm; spinlock_t *src_ptl, *dst_ptl; pte_t pte; dst_ptl = huge_pte_lock(h, mm, dst_pte); src_ptl = huge_pte_lockptr(h, mm, src_pte); /* * We don't have to worry about the ordering of src and dst ptlocks * because exclusive mmap_lock (or the i_mmap_lock) prevents deadlock. */ if (src_ptl != dst_ptl) spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); pte = huge_ptep_get_and_clear(mm, old_addr, src_pte); set_huge_pte_at(mm, new_addr, dst_pte, pte, sz); if (src_ptl != dst_ptl) spin_unlock(src_ptl); spin_unlock(dst_ptl); } int move_hugetlb_page_tables(struct vm_area_struct *vma, struct vm_area_struct *new_vma, unsigned long old_addr, unsigned long new_addr, unsigned long len) { struct hstate *h = hstate_vma(vma); struct address_space *mapping = vma->vm_file->f_mapping; unsigned long sz = huge_page_size(h); struct mm_struct *mm = vma->vm_mm; unsigned long old_end = old_addr + len; unsigned long last_addr_mask; pte_t *src_pte, *dst_pte; struct mmu_notifier_range range; bool shared_pmd = false; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, old_addr, old_end); adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); /* * In case of shared PMDs, we should cover the maximum possible * range. */ flush_cache_range(vma, range.start, range.end); mmu_notifier_invalidate_range_start(&range); last_addr_mask = hugetlb_mask_last_page(h); /* Prevent race with file truncation */ hugetlb_vma_lock_write(vma); i_mmap_lock_write(mapping); for (; old_addr < old_end; old_addr += sz, new_addr += sz) { src_pte = hugetlb_walk(vma, old_addr, sz); if (!src_pte) { old_addr |= last_addr_mask; new_addr |= last_addr_mask; continue; } if (huge_pte_none(huge_ptep_get(src_pte))) continue; if (huge_pmd_unshare(mm, vma, old_addr, src_pte)) { shared_pmd = true; old_addr |= last_addr_mask; new_addr |= last_addr_mask; continue; } dst_pte = huge_pte_alloc(mm, new_vma, new_addr, sz); if (!dst_pte) break; move_huge_pte(vma, old_addr, new_addr, src_pte, dst_pte, sz); } if (shared_pmd) flush_hugetlb_tlb_range(vma, range.start, range.end); else flush_hugetlb_tlb_range(vma, old_end - len, old_end); mmu_notifier_invalidate_range_end(&range); i_mmap_unlock_write(mapping); hugetlb_vma_unlock_write(vma); return len + old_addr - old_end; } void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct page *ref_page, zap_flags_t zap_flags) { struct mm_struct *mm = vma->vm_mm; unsigned long address; pte_t *ptep; pte_t pte; spinlock_t *ptl; struct page *page; struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); unsigned long last_addr_mask; bool force_flush = false; WARN_ON(!is_vm_hugetlb_page(vma)); BUG_ON(start & ~huge_page_mask(h)); BUG_ON(end & ~huge_page_mask(h)); /* * This is a hugetlb vma, all the pte entries should point * to huge page. */ tlb_change_page_size(tlb, sz); tlb_start_vma(tlb, vma); last_addr_mask = hugetlb_mask_last_page(h); address = start; for (; address < end; address += sz) { ptep = hugetlb_walk(vma, address, sz); if (!ptep) { address |= last_addr_mask; continue; } ptl = huge_pte_lock(h, mm, ptep); if (huge_pmd_unshare(mm, vma, address, ptep)) { spin_unlock(ptl); tlb_flush_pmd_range(tlb, address & PUD_MASK, PUD_SIZE); force_flush = true; address |= last_addr_mask; continue; } pte = huge_ptep_get(ptep); if (huge_pte_none(pte)) { spin_unlock(ptl); continue; } /* * Migrating hugepage or HWPoisoned hugepage is already * unmapped and its refcount is dropped, so just clear pte here. */ if (unlikely(!pte_present(pte))) { /* * If the pte was wr-protected by uffd-wp in any of the * swap forms, meanwhile the caller does not want to * drop the uffd-wp bit in this zap, then replace the * pte with a marker. */ if (pte_swp_uffd_wp_any(pte) && !(zap_flags & ZAP_FLAG_DROP_MARKER)) set_huge_pte_at(mm, address, ptep, make_pte_marker(PTE_MARKER_UFFD_WP), sz); else huge_pte_clear(mm, address, ptep, sz); spin_unlock(ptl); continue; } page = pte_page(pte); /* * If a reference page is supplied, it is because a specific * page is being unmapped, not a range. Ensure the page we * are about to unmap is the actual page of interest. */ if (ref_page) { if (page != ref_page) { spin_unlock(ptl); continue; } /* * Mark the VMA as having unmapped its page so that * future faults in this VMA will fail rather than * looking like data was lost */ set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED); } pte = huge_ptep_get_and_clear(mm, address, ptep); tlb_remove_huge_tlb_entry(h, tlb, ptep, address); if (huge_pte_dirty(pte)) set_page_dirty(page); /* Leave a uffd-wp pte marker if needed */ if (huge_pte_uffd_wp(pte) && !(zap_flags & ZAP_FLAG_DROP_MARKER)) set_huge_pte_at(mm, address, ptep, make_pte_marker(PTE_MARKER_UFFD_WP), sz); hugetlb_count_sub(pages_per_huge_page(h), mm); hugetlb_remove_rmap(page_folio(page)); spin_unlock(ptl); tlb_remove_page_size(tlb, page, huge_page_size(h)); /* * Bail out after unmapping reference page if supplied */ if (ref_page) break; } tlb_end_vma(tlb, vma); /* * If we unshared PMDs, the TLB flush was not recorded in mmu_gather. We * could defer the flush until now, since by holding i_mmap_rwsem we * guaranteed that the last refernece would not be dropped. But we must * do the flushing before we return, as otherwise i_mmap_rwsem will be * dropped and the last reference to the shared PMDs page might be * dropped as well. * * In theory we could defer the freeing of the PMD pages as well, but * huge_pmd_unshare() relies on the exact page_count for the PMD page to * detect sharing, so we cannot defer the release of the page either. * Instead, do flush now. */ if (force_flush) tlb_flush_mmu_tlbonly(tlb); } void __hugetlb_zap_begin(struct vm_area_struct *vma, unsigned long *start, unsigned long *end) { if (!vma->vm_file) /* hugetlbfs_file_mmap error */ return; adjust_range_if_pmd_sharing_possible(vma, start, end); hugetlb_vma_lock_write(vma); if (vma->vm_file) i_mmap_lock_write(vma->vm_file->f_mapping); } void __hugetlb_zap_end(struct vm_area_struct *vma, struct zap_details *details) { zap_flags_t zap_flags = details ? details->zap_flags : 0; if (!vma->vm_file) /* hugetlbfs_file_mmap error */ return; if (zap_flags & ZAP_FLAG_UNMAP) { /* final unmap */ /* * Unlock and free the vma lock before releasing i_mmap_rwsem. * When the vma_lock is freed, this makes the vma ineligible * for pmd sharing. And, i_mmap_rwsem is required to set up * pmd sharing. This is important as page tables for this * unmapped range will be asynchrously deleted. If the page * tables are shared, there will be issues when accessed by * someone else. */ __hugetlb_vma_unlock_write_free(vma); } else { hugetlb_vma_unlock_write(vma); } if (vma->vm_file) i_mmap_unlock_write(vma->vm_file->f_mapping); } void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, struct page *ref_page, zap_flags_t zap_flags) { struct mmu_notifier_range range; struct mmu_gather tlb; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, start, end); adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); mmu_notifier_invalidate_range_start(&range); tlb_gather_mmu(&tlb, vma->vm_mm); __unmap_hugepage_range(&tlb, vma, start, end, ref_page, zap_flags); mmu_notifier_invalidate_range_end(&range); tlb_finish_mmu(&tlb); } /* * This is called when the original mapper is failing to COW a MAP_PRIVATE * mapping it owns the reserve page for. The intention is to unmap the page * from other VMAs and let the children be SIGKILLed if they are faulting the * same region. */ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page, unsigned long address) { struct hstate *h = hstate_vma(vma); struct vm_area_struct *iter_vma; struct address_space *mapping; pgoff_t pgoff; /* * vm_pgoff is in PAGE_SIZE units, hence the different calculation * from page cache lookup which is in HPAGE_SIZE units. */ address = address & huge_page_mask(h); pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; mapping = vma->vm_file->f_mapping; /* * Take the mapping lock for the duration of the table walk. As * this mapping should be shared between all the VMAs, * __unmap_hugepage_range() is called as the lock is already held */ i_mmap_lock_write(mapping); vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) { /* Do not unmap the current VMA */ if (iter_vma == vma) continue; /* * Shared VMAs have their own reserves and do not affect * MAP_PRIVATE accounting but it is possible that a shared * VMA is using the same page so check and skip such VMAs. */ if (iter_vma->vm_flags & VM_MAYSHARE) continue; /* * Unmap the page from other VMAs without their own reserves. * They get marked to be SIGKILLed if they fault in these * areas. This is because a future no-page fault on this VMA * could insert a zeroed page instead of the data existing * from the time of fork. This would look like data corruption */ if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) unmap_hugepage_range(iter_vma, address, address + huge_page_size(h), page, 0); } i_mmap_unlock_write(mapping); } /* * hugetlb_wp() should be called with page lock of the original hugepage held. * Called with hugetlb_fault_mutex_table held and pte_page locked so we * cannot race with other handlers or page migration. * Keep the pte_same checks anyway to make transition from the mutex easier. */ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *ptep, unsigned int flags, struct folio *pagecache_folio, spinlock_t *ptl) { const bool unshare = flags & FAULT_FLAG_UNSHARE; pte_t pte = huge_ptep_get(ptep); struct hstate *h = hstate_vma(vma); struct folio *old_folio; struct folio *new_folio; int outside_reserve = 0; vm_fault_t ret = 0; unsigned long haddr = address & huge_page_mask(h); struct mmu_notifier_range range; /* * Never handle CoW for uffd-wp protected pages. It should be only * handled when the uffd-wp protection is removed. * * Note that only the CoW optimization path (in hugetlb_no_page()) * can trigger this, because hugetlb_fault() will always resolve * uffd-wp bit first. */ if (!unshare && huge_pte_uffd_wp(pte)) return 0; /* * hugetlb does not support FOLL_FORCE-style write faults that keep the * PTE mapped R/O such as maybe_mkwrite() would do. */ if (WARN_ON_ONCE(!unshare && !(vma->vm_flags & VM_WRITE))) return VM_FAULT_SIGSEGV; /* Let's take out MAP_SHARED mappings first. */ if (vma->vm_flags & VM_MAYSHARE) { set_huge_ptep_writable(vma, haddr, ptep); return 0; } old_folio = page_folio(pte_page(pte)); delayacct_wpcopy_start(); retry_avoidcopy: /* * If no-one else is actually using this page, we're the exclusive * owner and can reuse this page. */ if (folio_mapcount(old_folio) == 1 && folio_test_anon(old_folio)) { if (!PageAnonExclusive(&old_folio->page)) { folio_move_anon_rmap(old_folio, vma); SetPageAnonExclusive(&old_folio->page); } if (likely(!unshare)) set_huge_ptep_writable(vma, haddr, ptep); delayacct_wpcopy_end(); return 0; } VM_BUG_ON_PAGE(folio_test_anon(old_folio) && PageAnonExclusive(&old_folio->page), &old_folio->page); /* * If the process that created a MAP_PRIVATE mapping is about to * perform a COW due to a shared page count, attempt to satisfy * the allocation without using the existing reserves. The pagecache * page is used to determine if the reserve at this address was * consumed or not. If reserves were used, a partial faulted mapping * at the time of fork() could consume its reserves on COW instead * of the full address range. */ if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && old_folio != pagecache_folio) outside_reserve = 1; folio_get(old_folio); /* * Drop page table lock as buddy allocator may be called. It will * be acquired again before returning to the caller, as expected. */ spin_unlock(ptl); new_folio = alloc_hugetlb_folio(vma, haddr, outside_reserve); if (IS_ERR(new_folio)) { /* * If a process owning a MAP_PRIVATE mapping fails to COW, * it is due to references held by a child and an insufficient * huge page pool. To guarantee the original mappers * reliability, unmap the page from child processes. The child * may get SIGKILLed if it later faults. */ if (outside_reserve) { struct address_space *mapping = vma->vm_file->f_mapping; pgoff_t idx; u32 hash; folio_put(old_folio); /* * Drop hugetlb_fault_mutex and vma_lock before * unmapping. unmapping needs to hold vma_lock * in write mode. Dropping vma_lock in read mode * here is OK as COW mappings do not interact with * PMD sharing. * * Reacquire both after unmap operation. */ idx = vma_hugecache_offset(h, vma, haddr); hash = hugetlb_fault_mutex_hash(mapping, idx); hugetlb_vma_unlock_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); unmap_ref_private(mm, vma, &old_folio->page, haddr); mutex_lock(&hugetlb_fault_mutex_table[hash]); hugetlb_vma_lock_read(vma); spin_lock(ptl); ptep = hugetlb_walk(vma, haddr, huge_page_size(h)); if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) goto retry_avoidcopy; /* * race occurs while re-acquiring page table * lock, and our job is done. */ delayacct_wpcopy_end(); return 0; } ret = vmf_error(PTR_ERR(new_folio)); goto out_release_old; } /* * When the original hugepage is shared one, it does not have * anon_vma prepared. */ if (unlikely(anon_vma_prepare(vma))) { ret = VM_FAULT_OOM; goto out_release_all; } if (copy_user_large_folio(new_folio, old_folio, address, vma)) { ret = VM_FAULT_HWPOISON_LARGE; goto out_release_all; } __folio_mark_uptodate(new_folio); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, haddr, haddr + huge_page_size(h)); mmu_notifier_invalidate_range_start(&range); /* * Retake the page table lock to check for racing updates * before the page tables are altered */ spin_lock(ptl); ptep = hugetlb_walk(vma, haddr, huge_page_size(h)); if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) { pte_t newpte = make_huge_pte(vma, &new_folio->page, !unshare); /* Break COW or unshare */ huge_ptep_clear_flush(vma, haddr, ptep); hugetlb_remove_rmap(old_folio); hugetlb_add_new_anon_rmap(new_folio, vma, haddr); if (huge_pte_uffd_wp(pte)) newpte = huge_pte_mkuffd_wp(newpte); set_huge_pte_at(mm, haddr, ptep, newpte, huge_page_size(h)); folio_set_hugetlb_migratable(new_folio); /* Make the old page be freed below */ new_folio = old_folio; } spin_unlock(ptl); mmu_notifier_invalidate_range_end(&range); out_release_all: /* * No restore in case of successful pagetable update (Break COW or * unshare) */ if (new_folio != old_folio) restore_reserve_on_error(h, vma, haddr, new_folio); folio_put(new_folio); out_release_old: folio_put(old_folio); spin_lock(ptl); /* Caller expects lock to be held */ delayacct_wpcopy_end(); return ret; } /* * Return whether there is a pagecache page to back given address within VMA. */ static bool hugetlbfs_pagecache_present(struct hstate *h, struct vm_area_struct *vma, unsigned long address) { struct address_space *mapping = vma->vm_file->f_mapping; pgoff_t idx = linear_page_index(vma, address); struct folio *folio; folio = filemap_get_folio(mapping, idx); if (IS_ERR(folio)) return false; folio_put(folio); return true; } int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping, pgoff_t idx) { struct inode *inode = mapping->host; struct hstate *h = hstate_inode(inode); int err; idx <<= huge_page_order(h); __folio_set_locked(folio); err = __filemap_add_folio(mapping, folio, idx, GFP_KERNEL, NULL); if (unlikely(err)) { __folio_clear_locked(folio); return err; } folio_clear_hugetlb_restore_reserve(folio); /* * mark folio dirty so that it will not be removed from cache/file * by non-hugetlbfs specific code paths. */ folio_mark_dirty(folio); spin_lock(&inode->i_lock); inode->i_blocks += blocks_per_huge_page(h); spin_unlock(&inode->i_lock); return 0; } static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma, struct address_space *mapping, pgoff_t idx, unsigned int flags, unsigned long haddr, unsigned long addr, unsigned long reason) { u32 hash; struct vm_fault vmf = { .vma = vma, .address = haddr, .real_address = addr, .flags = flags, /* * Hard to debug if it ends up being * used by a callee that assumes * something about the other * uninitialized fields... same as in * memory.c */ }; /* * vma_lock and hugetlb_fault_mutex must be dropped before handling * userfault. Also mmap_lock could be dropped due to handling * userfault, any vma operation should be careful from here. */ hugetlb_vma_unlock_read(vma); hash = hugetlb_fault_mutex_hash(mapping, idx); mutex_unlock(&hugetlb_fault_mutex_table[hash]); return handle_userfault(&vmf, reason); } /* * Recheck pte with pgtable lock. Returns true if pte didn't change, or * false if pte changed or is changing. */ static bool hugetlb_pte_stable(struct hstate *h, struct mm_struct *mm, pte_t *ptep, pte_t old_pte) { spinlock_t *ptl; bool same; ptl = huge_pte_lock(h, mm, ptep); same = pte_same(huge_ptep_get(ptep), old_pte); spin_unlock(ptl); return same; } static vm_fault_t hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, struct address_space *mapping, pgoff_t idx, unsigned long address, pte_t *ptep, pte_t old_pte, unsigned int flags) { struct hstate *h = hstate_vma(vma); vm_fault_t ret = VM_FAULT_SIGBUS; int anon_rmap = 0; unsigned long size; struct folio *folio; pte_t new_pte; spinlock_t *ptl; unsigned long haddr = address & huge_page_mask(h); bool new_folio, new_pagecache_folio = false; u32 hash = hugetlb_fault_mutex_hash(mapping, idx); /* * Currently, we are forced to kill the process in the event the * original mapper has unmapped pages from the child due to a failed * COW/unsharing. Warn that such a situation has occurred as it may not * be obvious. */ if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) { pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n", current->pid); goto out; } /* * Use page lock to guard against racing truncation * before we get page_table_lock. */ new_folio = false; folio = filemap_lock_hugetlb_folio(h, mapping, idx); if (IS_ERR(folio)) { size = i_size_read(mapping->host) >> huge_page_shift(h); if (idx >= size) goto out; /* Check for page in userfault range */ if (userfaultfd_missing(vma)) { /* * Since hugetlb_no_page() was examining pte * without pgtable lock, we need to re-test under * lock because the pte may not be stable and could * have changed from under us. Try to detect * either changed or during-changing ptes and retry * properly when needed. * * Note that userfaultfd is actually fine with * false positives (e.g. caused by pte changed), * but not wrong logical events (e.g. caused by * reading a pte during changing). The latter can * confuse the userspace, so the strictness is very * much preferred. E.g., MISSING event should * never happen on the page after UFFDIO_COPY has * correctly installed the page and returned. */ if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) { ret = 0; goto out; } return hugetlb_handle_userfault(vma, mapping, idx, flags, haddr, address, VM_UFFD_MISSING); } folio = alloc_hugetlb_folio(vma, haddr, 0); if (IS_ERR(folio)) { /* * Returning error will result in faulting task being * sent SIGBUS. The hugetlb fault mutex prevents two * tasks from racing to fault in the same page which * could result in false unable to allocate errors. * Page migration does not take the fault mutex, but * does a clear then write of pte's under page table * lock. Page fault code could race with migration, * notice the clear pte and try to allocate a page * here. Before returning error, get ptl and make * sure there really is no pte entry. */ if (hugetlb_pte_stable(h, mm, ptep, old_pte)) ret = vmf_error(PTR_ERR(folio)); else ret = 0; goto out; } clear_huge_page(&folio->page, address, pages_per_huge_page(h)); __folio_mark_uptodate(folio); new_folio = true; if (vma->vm_flags & VM_MAYSHARE) { int err = hugetlb_add_to_page_cache(folio, mapping, idx); if (err) { /* * err can't be -EEXIST which implies someone * else consumed the reservation since hugetlb * fault mutex is held when add a hugetlb page * to the page cache. So it's safe to call * restore_reserve_on_error() here. */ restore_reserve_on_error(h, vma, haddr, folio); folio_put(folio); goto out; } new_pagecache_folio = true; } else { folio_lock(folio); if (unlikely(anon_vma_prepare(vma))) { ret = VM_FAULT_OOM; goto backout_unlocked; } anon_rmap = 1; } } else { /* * If memory error occurs between mmap() and fault, some process * don't have hwpoisoned swap entry for errored virtual address. * So we need to block hugepage fault by PG_hwpoison bit check. */ if (unlikely(folio_test_hwpoison(folio))) { ret = VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h)); goto backout_unlocked; } /* Check for page in userfault range. */ if (userfaultfd_minor(vma)) { folio_unlock(folio); folio_put(folio); /* See comment in userfaultfd_missing() block above */ if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) { ret = 0; goto out; } return hugetlb_handle_userfault(vma, mapping, idx, flags, haddr, address, VM_UFFD_MINOR); } } /* * If we are going to COW a private mapping later, we examine the * pending reservations for this page now. This will ensure that * any allocations necessary to record that reservation occur outside * the spinlock. */ if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { if (vma_needs_reservation(h, vma, haddr) < 0) { ret = VM_FAULT_OOM; goto backout_unlocked; } /* Just decrements count, does not deallocate */ vma_end_reservation(h, vma, haddr); } ptl = huge_pte_lock(h, mm, ptep); ret = 0; /* If pte changed from under us, retry */ if (!pte_same(huge_ptep_get(ptep), old_pte)) goto backout; if (anon_rmap) hugetlb_add_new_anon_rmap(folio, vma, haddr); else hugetlb_add_file_rmap(folio); new_pte = make_huge_pte(vma, &folio->page, ((vma->vm_flags & VM_WRITE) && (vma->vm_flags & VM_SHARED))); /* * If this pte was previously wr-protected, keep it wr-protected even * if populated. */ if (unlikely(pte_marker_uffd_wp(old_pte))) new_pte = huge_pte_mkuffd_wp(new_pte); set_huge_pte_at(mm, haddr, ptep, new_pte, huge_page_size(h)); hugetlb_count_add(pages_per_huge_page(h), mm); if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { /* Optimization, do the COW without a second fault */ ret = hugetlb_wp(mm, vma, address, ptep, flags, folio, ptl); } spin_unlock(ptl); /* * Only set hugetlb_migratable in newly allocated pages. Existing pages * found in the pagecache may not have hugetlb_migratable if they have * been isolated for migration. */ if (new_folio) folio_set_hugetlb_migratable(folio); folio_unlock(folio); out: hugetlb_vma_unlock_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); return ret; backout: spin_unlock(ptl); backout_unlocked: if (new_folio && !new_pagecache_folio) restore_reserve_on_error(h, vma, haddr, folio); folio_unlock(folio); folio_put(folio); goto out; } #ifdef CONFIG_SMP u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx) { unsigned long key[2]; u32 hash; key[0] = (unsigned long) mapping; key[1] = idx; hash = jhash2((u32 *)&key, sizeof(key)/(sizeof(u32)), 0); return hash & (num_fault_mutexes - 1); } #else /* * For uniprocessor systems we always use a single mutex, so just * return 0 and avoid the hashing overhead. */ u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx) { return 0; } #endif vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags) { pte_t *ptep, entry; spinlock_t *ptl; vm_fault_t ret; u32 hash; pgoff_t idx; struct folio *folio = NULL; struct folio *pagecache_folio = NULL; struct hstate *h = hstate_vma(vma); struct address_space *mapping; int need_wait_lock = 0; unsigned long haddr = address & huge_page_mask(h); /* TODO: Handle faults under the VMA lock */ if (flags & FAULT_FLAG_VMA_LOCK) { vma_end_read(vma); return VM_FAULT_RETRY; } /* * Serialize hugepage allocation and instantiation, so that we don't * get spurious allocation failures if two CPUs race to instantiate * the same page in the page cache. */ mapping = vma->vm_file->f_mapping; idx = vma_hugecache_offset(h, vma, haddr); hash = hugetlb_fault_mutex_hash(mapping, idx); mutex_lock(&hugetlb_fault_mutex_table[hash]); /* * Acquire vma lock before calling huge_pte_alloc and hold * until finished with ptep. This prevents huge_pmd_unshare from * being called elsewhere and making the ptep no longer valid. */ hugetlb_vma_lock_read(vma); ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h)); if (!ptep) { hugetlb_vma_unlock_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); return VM_FAULT_OOM; } entry = huge_ptep_get(ptep); if (huge_pte_none_mostly(entry)) { if (is_pte_marker(entry)) { pte_marker marker = pte_marker_get(pte_to_swp_entry(entry)); if (marker & PTE_MARKER_POISONED) { ret = VM_FAULT_HWPOISON_LARGE; goto out_mutex; } } /* * Other PTE markers should be handled the same way as none PTE. * * hugetlb_no_page will drop vma lock and hugetlb fault * mutex internally, which make us return immediately. */ return hugetlb_no_page(mm, vma, mapping, idx, address, ptep, entry, flags); } ret = 0; /* * entry could be a migration/hwpoison entry at this point, so this * check prevents the kernel from going below assuming that we have * an active hugepage in pagecache. This goto expects the 2nd page * fault, and is_hugetlb_entry_(migration|hwpoisoned) check will * properly handle it. */ if (!pte_present(entry)) { if (unlikely(is_hugetlb_entry_migration(entry))) { /* * Release the hugetlb fault lock now, but retain * the vma lock, because it is needed to guard the * huge_pte_lockptr() later in * migration_entry_wait_huge(). The vma lock will * be released there. */ mutex_unlock(&hugetlb_fault_mutex_table[hash]); migration_entry_wait_huge(vma, ptep); return 0; } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) ret = VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h)); goto out_mutex; } /* * If we are going to COW/unshare the mapping later, we examine the * pending reservations for this page now. This will ensure that any * allocations necessary to record that reservation occur outside the * spinlock. Also lookup the pagecache page now as it is used to * determine if a reservation has been consumed. */ if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) && !(vma->vm_flags & VM_MAYSHARE) && !huge_pte_write(entry)) { if (vma_needs_reservation(h, vma, haddr) < 0) { ret = VM_FAULT_OOM; goto out_mutex; } /* Just decrements count, does not deallocate */ vma_end_reservation(h, vma, haddr); pagecache_folio = filemap_lock_hugetlb_folio(h, mapping, idx); if (IS_ERR(pagecache_folio)) pagecache_folio = NULL; } ptl = huge_pte_lock(h, mm, ptep); /* Check for a racing update before calling hugetlb_wp() */ if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) goto out_ptl; /* Handle userfault-wp first, before trying to lock more pages */ if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(ptep)) && (flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) { if (!userfaultfd_wp_async(vma)) { struct vm_fault vmf = { .vma = vma, .address = haddr, .real_address = address, .flags = flags, }; spin_unlock(ptl); if (pagecache_folio) { folio_unlock(pagecache_folio); folio_put(pagecache_folio); } hugetlb_vma_unlock_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); return handle_userfault(&vmf, VM_UFFD_WP); } entry = huge_pte_clear_uffd_wp(entry); set_huge_pte_at(mm, haddr, ptep, entry, huge_page_size(hstate_vma(vma))); /* Fallthrough to CoW */ } /* * hugetlb_wp() requires page locks of pte_page(entry) and * pagecache_folio, so here we need take the former one * when folio != pagecache_folio or !pagecache_folio. */ folio = page_folio(pte_page(entry)); if (folio != pagecache_folio) if (!folio_trylock(folio)) { need_wait_lock = 1; goto out_ptl; } folio_get(folio); if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) { if (!huge_pte_write(entry)) { ret = hugetlb_wp(mm, vma, address, ptep, flags, pagecache_folio, ptl); goto out_put_page; } else if (likely(flags & FAULT_FLAG_WRITE)) { entry = huge_pte_mkdirty(entry); } } entry = pte_mkyoung(entry); if (huge_ptep_set_access_flags(vma, haddr, ptep, entry, flags & FAULT_FLAG_WRITE)) update_mmu_cache(vma, haddr, ptep); out_put_page: if (folio != pagecache_folio) folio_unlock(folio); folio_put(folio); out_ptl: spin_unlock(ptl); if (pagecache_folio) { folio_unlock(pagecache_folio); folio_put(pagecache_folio); } out_mutex: hugetlb_vma_unlock_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); /* * Generally it's safe to hold refcount during waiting page lock. But * here we just wait to defer the next page fault to avoid busy loop and * the page is not used after unlocked before returning from the current * page fault. So we are safe from accessing freed page, even if we wait * here without taking refcount. */ if (need_wait_lock) folio_wait_locked(folio); return ret; } #ifdef CONFIG_USERFAULTFD /* * Can probably be eliminated, but still used by hugetlb_mfill_atomic_pte(). */ static struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address) { struct mempolicy *mpol; nodemask_t *nodemask; struct folio *folio; gfp_t gfp_mask; int node; gfp_mask = htlb_alloc_mask(h); node = huge_node(vma, address, gfp_mask, &mpol, &nodemask); folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask); mpol_cond_put(mpol); return folio; } /* * Used by userfaultfd UFFDIO_* ioctls. Based on userfaultfd's mfill_atomic_pte * with modifications for hugetlb pages. */ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, uffd_flags_t flags, struct folio **foliop) { struct mm_struct *dst_mm = dst_vma->vm_mm; bool is_continue = uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE); bool wp_enabled = (flags & MFILL_ATOMIC_WP); struct hstate *h = hstate_vma(dst_vma); struct address_space *mapping = dst_vma->vm_file->f_mapping; pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr); unsigned long size; int vm_shared = dst_vma->vm_flags & VM_SHARED; pte_t _dst_pte; spinlock_t *ptl; int ret = -ENOMEM; struct folio *folio; int writable; bool folio_in_pagecache = false; if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) { ptl = huge_pte_lock(h, dst_mm, dst_pte); /* Don't overwrite any existing PTEs (even markers) */ if (!huge_pte_none(huge_ptep_get(dst_pte))) { spin_unlock(ptl); return -EEXIST; } _dst_pte = make_pte_marker(PTE_MARKER_POISONED); set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte, huge_page_size(h)); /* No need to invalidate - it was non-present before */ update_mmu_cache(dst_vma, dst_addr, dst_pte); spin_unlock(ptl); return 0; } if (is_continue) { ret = -EFAULT; folio = filemap_lock_hugetlb_folio(h, mapping, idx); if (IS_ERR(folio)) goto out; folio_in_pagecache = true; } else if (!*foliop) { /* If a folio already exists, then it's UFFDIO_COPY for * a non-missing case. Return -EEXIST. */ if (vm_shared && hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) { ret = -EEXIST; goto out; } folio = alloc_hugetlb_folio(dst_vma, dst_addr, 0); if (IS_ERR(folio)) { ret = -ENOMEM; goto out; } ret = copy_folio_from_user(folio, (const void __user *) src_addr, false); /* fallback to copy_from_user outside mmap_lock */ if (unlikely(ret)) { ret = -ENOENT; /* Free the allocated folio which may have * consumed a reservation. */ restore_reserve_on_error(h, dst_vma, dst_addr, folio); folio_put(folio); /* Allocate a temporary folio to hold the copied * contents. */ folio = alloc_hugetlb_folio_vma(h, dst_vma, dst_addr); if (!folio) { ret = -ENOMEM; goto out; } *foliop = folio; /* Set the outparam foliop and return to the caller to * copy the contents outside the lock. Don't free the * folio. */ goto out; } } else { if (vm_shared && hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) { folio_put(*foliop); ret = -EEXIST; *foliop = NULL; goto out; } folio = alloc_hugetlb_folio(dst_vma, dst_addr, 0); if (IS_ERR(folio)) { folio_put(*foliop); ret = -ENOMEM; *foliop = NULL; goto out; } ret = copy_user_large_folio(folio, *foliop, dst_addr, dst_vma); folio_put(*foliop); *foliop = NULL; if (ret) { folio_put(folio); goto out; } } /* * The memory barrier inside __folio_mark_uptodate makes sure that * preceding stores to the page contents become visible before * the set_pte_at() write. */ __folio_mark_uptodate(folio); /* Add shared, newly allocated pages to the page cache. */ if (vm_shared && !is_continue) { size = i_size_read(mapping->host) >> huge_page_shift(h); ret = -EFAULT; if (idx >= size) goto out_release_nounlock; /* * Serialization between remove_inode_hugepages() and * hugetlb_add_to_page_cache() below happens through the * hugetlb_fault_mutex_table that here must be hold by * the caller. */ ret = hugetlb_add_to_page_cache(folio, mapping, idx); if (ret) goto out_release_nounlock; folio_in_pagecache = true; } ptl = huge_pte_lock(h, dst_mm, dst_pte); ret = -EIO; if (folio_test_hwpoison(folio)) goto out_release_unlock; /* * We allow to overwrite a pte marker: consider when both MISSING|WP * registered, we firstly wr-protect a none pte which has no page cache * page backing it, then access the page. */ ret = -EEXIST; if (!huge_pte_none_mostly(huge_ptep_get(dst_pte))) goto out_release_unlock; if (folio_in_pagecache) hugetlb_add_file_rmap(folio); else hugetlb_add_new_anon_rmap(folio, dst_vma, dst_addr); /* * For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY * with wp flag set, don't set pte write bit. */ if (wp_enabled || (is_continue && !vm_shared)) writable = 0; else writable = dst_vma->vm_flags & VM_WRITE; _dst_pte = make_huge_pte(dst_vma, &folio->page, writable); /* * Always mark UFFDIO_COPY page dirty; note that this may not be * extremely important for hugetlbfs for now since swapping is not * supported, but we should still be clear in that this page cannot be * thrown away at will, even if write bit not set. */ _dst_pte = huge_pte_mkdirty(_dst_pte); _dst_pte = pte_mkyoung(_dst_pte); if (wp_enabled) _dst_pte = huge_pte_mkuffd_wp(_dst_pte); set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte, huge_page_size(h)); hugetlb_count_add(pages_per_huge_page(h), dst_mm); /* No need to invalidate - it was non-present before */ update_mmu_cache(dst_vma, dst_addr, dst_pte); spin_unlock(ptl); if (!is_continue) folio_set_hugetlb_migratable(folio); if (vm_shared || is_continue) folio_unlock(folio); ret = 0; out: return ret; out_release_unlock: spin_unlock(ptl); if (vm_shared || is_continue) folio_unlock(folio); out_release_nounlock: if (!folio_in_pagecache) restore_reserve_on_error(h, dst_vma, dst_addr, folio); folio_put(folio); goto out; } #endif /* CONFIG_USERFAULTFD */ struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, unsigned long address, unsigned int flags, unsigned int *page_mask) { struct hstate *h = hstate_vma(vma); struct mm_struct *mm = vma->vm_mm; unsigned long haddr = address & huge_page_mask(h); struct page *page = NULL; spinlock_t *ptl; pte_t *pte, entry; int ret; hugetlb_vma_lock_read(vma); pte = hugetlb_walk(vma, haddr, huge_page_size(h)); if (!pte) goto out_unlock; ptl = huge_pte_lock(h, mm, pte); entry = huge_ptep_get(pte); if (pte_present(entry)) { page = pte_page(entry); if (!huge_pte_write(entry)) { if (flags & FOLL_WRITE) { page = NULL; goto out; } if (gup_must_unshare(vma, flags, page)) { /* Tell the caller to do unsharing */ page = ERR_PTR(-EMLINK); goto out; } } page = nth_page(page, ((address & ~huge_page_mask(h)) >> PAGE_SHIFT)); /* * Note that page may be a sub-page, and with vmemmap * optimizations the page struct may be read only. * try_grab_page() will increase the ref count on the * head page, so this will be OK. * * try_grab_page() should always be able to get the page here, * because we hold the ptl lock and have verified pte_present(). */ ret = try_grab_page(page, flags); if (WARN_ON_ONCE(ret)) { page = ERR_PTR(ret); goto out; } *page_mask = (1U << huge_page_order(h)) - 1; } out: spin_unlock(ptl); out_unlock: hugetlb_vma_unlock_read(vma); /* * Fixup retval for dump requests: if pagecache doesn't exist, * don't try to allocate a new page but just skip it. */ if (!page && (flags & FOLL_DUMP) && !hugetlbfs_pagecache_present(h, vma, address)) page = ERR_PTR(-EFAULT); return page; } long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot, unsigned long cp_flags) { struct mm_struct *mm = vma->vm_mm; unsigned long start = address; pte_t *ptep; pte_t pte; struct hstate *h = hstate_vma(vma); long pages = 0, psize = huge_page_size(h); bool shared_pmd = false; struct mmu_notifier_range range; unsigned long last_addr_mask; bool uffd_wp = cp_flags & MM_CP_UFFD_WP; bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE; /* * In the case of shared PMDs, the area to flush could be beyond * start/end. Set range.start/range.end to cover the maximum possible * range if PMD sharing is possible. */ mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 0, mm, start, end); adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); BUG_ON(address >= end); flush_cache_range(vma, range.start, range.end); mmu_notifier_invalidate_range_start(&range); hugetlb_vma_lock_write(vma); i_mmap_lock_write(vma->vm_file->f_mapping); last_addr_mask = hugetlb_mask_last_page(h); for (; address < end; address += psize) { spinlock_t *ptl; ptep = hugetlb_walk(vma, address, psize); if (!ptep) { if (!uffd_wp) { address |= last_addr_mask; continue; } /* * Userfaultfd wr-protect requires pgtable * pre-allocations to install pte markers. */ ptep = huge_pte_alloc(mm, vma, address, psize); if (!ptep) { pages = -ENOMEM; break; } } ptl = huge_pte_lock(h, mm, ptep); if (huge_pmd_unshare(mm, vma, address, ptep)) { /* * When uffd-wp is enabled on the vma, unshare * shouldn't happen at all. Warn about it if it * happened due to some reason. */ WARN_ON_ONCE(uffd_wp || uffd_wp_resolve); pages++; spin_unlock(ptl); shared_pmd = true; address |= last_addr_mask; continue; } pte = huge_ptep_get(ptep); if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) { /* Nothing to do. */ } else if (unlikely(is_hugetlb_entry_migration(pte))) { swp_entry_t entry = pte_to_swp_entry(pte); struct page *page = pfn_swap_entry_to_page(entry); pte_t newpte = pte; if (is_writable_migration_entry(entry)) { if (PageAnon(page)) entry = make_readable_exclusive_migration_entry( swp_offset(entry)); else entry = make_readable_migration_entry( swp_offset(entry)); newpte = swp_entry_to_pte(entry); pages++; } if (uffd_wp) newpte = pte_swp_mkuffd_wp(newpte); else if (uffd_wp_resolve) newpte = pte_swp_clear_uffd_wp(newpte); if (!pte_same(pte, newpte)) set_huge_pte_at(mm, address, ptep, newpte, psize); } else if (unlikely(is_pte_marker(pte))) { /* No other markers apply for now. */ WARN_ON_ONCE(!pte_marker_uffd_wp(pte)); if (uffd_wp_resolve) /* Safe to modify directly (non-present->none). */ huge_pte_clear(mm, address, ptep, psize); } else if (!huge_pte_none(pte)) { pte_t old_pte; unsigned int shift = huge_page_shift(hstate_vma(vma)); old_pte = huge_ptep_modify_prot_start(vma, address, ptep); pte = huge_pte_modify(old_pte, newprot); pte = arch_make_huge_pte(pte, shift, vma->vm_flags); if (uffd_wp) pte = huge_pte_mkuffd_wp(pte); else if (uffd_wp_resolve) pte = huge_pte_clear_uffd_wp(pte); huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte); pages++; } else { /* None pte */ if (unlikely(uffd_wp)) /* Safe to modify directly (none->non-present). */ set_huge_pte_at(mm, address, ptep, make_pte_marker(PTE_MARKER_UFFD_WP), psize); } spin_unlock(ptl); } /* * Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare * may have cleared our pud entry and done put_page on the page table: * once we release i_mmap_rwsem, another task can do the final put_page * and that page table be reused and filled with junk. If we actually * did unshare a page of pmds, flush the range corresponding to the pud. */ if (shared_pmd) flush_hugetlb_tlb_range(vma, range.start, range.end); else flush_hugetlb_tlb_range(vma, start, end); /* * No need to call mmu_notifier_arch_invalidate_secondary_tlbs() we are * downgrading page table protection not changing it to point to a new * page. * * See Documentation/mm/mmu_notifier.rst */ i_mmap_unlock_write(vma->vm_file->f_mapping); hugetlb_vma_unlock_write(vma); mmu_notifier_invalidate_range_end(&range); return pages > 0 ? (pages << h->order) : pages; } /* Return true if reservation was successful, false otherwise. */ bool hugetlb_reserve_pages(struct inode *inode, long from, long to, struct vm_area_struct *vma, vm_flags_t vm_flags) { long chg = -1, add = -1; struct hstate *h = hstate_inode(inode); struct hugepage_subpool *spool = subpool_inode(inode); struct resv_map *resv_map; struct hugetlb_cgroup *h_cg = NULL; long gbl_reserve, regions_needed = 0; /* This should never happen */ if (from > to) { VM_WARN(1, "%s called with a negative range\n", __func__); return false; } /* * vma specific semaphore used for pmd sharing and fault/truncation * synchronization */ hugetlb_vma_lock_alloc(vma); /* * Only apply hugepage reservation if asked. At fault time, an * attempt will be made for VM_NORESERVE to allocate a page * without using reserves */ if (vm_flags & VM_NORESERVE) return true; /* * Shared mappings base their reservation on the number of pages that * are already allocated on behalf of the file. Private mappings need * to reserve the full area even if read-only as mprotect() may be * called to make the mapping read-write. Assume !vma is a shm mapping */ if (!vma || vma->vm_flags & VM_MAYSHARE) { /* * resv_map can not be NULL as hugetlb_reserve_pages is only * called for inodes for which resv_maps were created (see * hugetlbfs_get_inode). */ resv_map = inode_resv_map(inode); chg = region_chg(resv_map, from, to, &regions_needed); } else { /* Private mapping. */ resv_map = resv_map_alloc(); if (!resv_map) goto out_err; chg = to - from; set_vma_resv_map(vma, resv_map); set_vma_resv_flags(vma, HPAGE_RESV_OWNER); } if (chg < 0) goto out_err; if (hugetlb_cgroup_charge_cgroup_rsvd(hstate_index(h), chg * pages_per_huge_page(h), &h_cg) < 0) goto out_err; if (vma && !(vma->vm_flags & VM_MAYSHARE) && h_cg) { /* For private mappings, the hugetlb_cgroup uncharge info hangs * of the resv_map. */ resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, h_cg, h); } /* * There must be enough pages in the subpool for the mapping. If * the subpool has a minimum size, there may be some global * reservations already in place (gbl_reserve). */ gbl_reserve = hugepage_subpool_get_pages(spool, chg); if (gbl_reserve < 0) goto out_uncharge_cgroup; /* * Check enough hugepages are available for the reservation. * Hand the pages back to the subpool if there are not */ if (hugetlb_acct_memory(h, gbl_reserve) < 0) goto out_put_pages; /* * Account for the reservations made. Shared mappings record regions * that have reservations as they are shared by multiple VMAs. * When the last VMA disappears, the region map says how much * the reservation was and the page cache tells how much of * the reservation was consumed. Private mappings are per-VMA and * only the consumed reservations are tracked. When the VMA * disappears, the original reservation is the VMA size and the * consumed reservations are stored in the map. Hence, nothing * else has to be done for private mappings here */ if (!vma || vma->vm_flags & VM_MAYSHARE) { add = region_add(resv_map, from, to, regions_needed, h, h_cg); if (unlikely(add < 0)) { hugetlb_acct_memory(h, -gbl_reserve); goto out_put_pages; } else if (unlikely(chg > add)) { /* * pages in this range were added to the reserve * map between region_chg and region_add. This * indicates a race with alloc_hugetlb_folio. Adjust * the subpool and reserve counts modified above * based on the difference. */ long rsv_adjust; /* * hugetlb_cgroup_uncharge_cgroup_rsvd() will put the * reference to h_cg->css. See comment below for detail. */ hugetlb_cgroup_uncharge_cgroup_rsvd( hstate_index(h), (chg - add) * pages_per_huge_page(h), h_cg); rsv_adjust = hugepage_subpool_put_pages(spool, chg - add); hugetlb_acct_memory(h, -rsv_adjust); } else if (h_cg) { /* * The file_regions will hold their own reference to * h_cg->css. So we should release the reference held * via hugetlb_cgroup_charge_cgroup_rsvd() when we are * done. */ hugetlb_cgroup_put_rsvd_cgroup(h_cg); } } return true; out_put_pages: /* put back original number of pages, chg */ (void)hugepage_subpool_put_pages(spool, chg); out_uncharge_cgroup: hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h), chg * pages_per_huge_page(h), h_cg); out_err: hugetlb_vma_lock_free(vma); if (!vma || vma->vm_flags & VM_MAYSHARE) /* Only call region_abort if the region_chg succeeded but the * region_add failed or didn't run. */ if (chg >= 0 && add < 0) region_abort(resv_map, from, to, regions_needed); if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { kref_put(&resv_map->refs, resv_map_release); set_vma_resv_map(vma, NULL); } return false; } long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long freed) { struct hstate *h = hstate_inode(inode); struct resv_map *resv_map = inode_resv_map(inode); long chg = 0; struct hugepage_subpool *spool = subpool_inode(inode); long gbl_reserve; /* * Since this routine can be called in the evict inode path for all * hugetlbfs inodes, resv_map could be NULL. */ if (resv_map) { chg = region_del(resv_map, start, end); /* * region_del() can fail in the rare case where a region * must be split and another region descriptor can not be * allocated. If end == LONG_MAX, it will not fail. */ if (chg < 0) return chg; } spin_lock(&inode->i_lock); inode->i_blocks -= (blocks_per_huge_page(h) * freed); spin_unlock(&inode->i_lock); /* * If the subpool has a minimum size, the number of global * reservations to be released may be adjusted. * * Note that !resv_map implies freed == 0. So (chg - freed) * won't go negative. */ gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed)); hugetlb_acct_memory(h, -gbl_reserve); return 0; } #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE static unsigned long page_table_shareable(struct vm_area_struct *svma, struct vm_area_struct *vma, unsigned long addr, pgoff_t idx) { unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) + svma->vm_start; unsigned long sbase = saddr & PUD_MASK; unsigned long s_end = sbase + PUD_SIZE; /* Allow segments to share if only one is marked locked */ unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED_MASK; unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED_MASK; /* * match the virtual addresses, permission and the alignment of the * page table page. * * Also, vma_lock (vm_private_data) is required for sharing. */ if (pmd_index(addr) != pmd_index(saddr) || vm_flags != svm_flags || !range_in_vma(svma, sbase, s_end) || !svma->vm_private_data) return 0; return saddr; } bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr) { unsigned long start = addr & PUD_MASK; unsigned long end = start + PUD_SIZE; #ifdef CONFIG_USERFAULTFD if (uffd_disable_huge_pmd_share(vma)) return false; #endif /* * check on proper vm_flags and page table alignment */ if (!(vma->vm_flags & VM_MAYSHARE)) return false; if (!vma->vm_private_data) /* vma lock required for sharing */ return false; if (!range_in_vma(vma, start, end)) return false; return true; } /* * Determine if start,end range within vma could be mapped by shared pmd. * If yes, adjust start and end to cover range associated with possible * shared pmd mappings. */ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, unsigned long *start, unsigned long *end) { unsigned long v_start = ALIGN(vma->vm_start, PUD_SIZE), v_end = ALIGN_DOWN(vma->vm_end, PUD_SIZE); /* * vma needs to span at least one aligned PUD size, and the range * must be at least partially within in. */ if (!(vma->vm_flags & VM_MAYSHARE) || !(v_end > v_start) || (*end <= v_start) || (*start >= v_end)) return; /* Extend the range to be PUD aligned for a worst case scenario */ if (*start > v_start) *start = ALIGN_DOWN(*start, PUD_SIZE); if (*end < v_end) *end = ALIGN(*end, PUD_SIZE); } /* * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc() * and returns the corresponding pte. While this is not necessary for the * !shared pmd case because we can allocate the pmd later as well, it makes the * code much cleaner. pmd allocation is essential for the shared case because * pud has to be populated inside the same i_mmap_rwsem section - otherwise * racing tasks could either miss the sharing (see huge_pte_offset) or select a * bad pmd for sharing. */ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pud_t *pud) { struct address_space *mapping = vma->vm_file->f_mapping; pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; struct vm_area_struct *svma; unsigned long saddr; pte_t *spte = NULL; pte_t *pte; i_mmap_lock_read(mapping); vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) { if (svma == vma) continue; saddr = page_table_shareable(svma, vma, addr, idx); if (saddr) { spte = hugetlb_walk(svma, saddr, vma_mmu_pagesize(svma)); if (spte) { get_page(virt_to_page(spte)); break; } } } if (!spte) goto out; spin_lock(&mm->page_table_lock); if (pud_none(*pud)) { pud_populate(mm, pud, (pmd_t *)((unsigned long)spte & PAGE_MASK)); mm_inc_nr_pmds(mm); } else { put_page(virt_to_page(spte)); } spin_unlock(&mm->page_table_lock); out: pte = (pte_t *)pmd_alloc(mm, pud, addr); i_mmap_unlock_read(mapping); return pte; } /* * unmap huge page backed by shared pte. * * Hugetlb pte page is ref counted at the time of mapping. If pte is shared * indicated by page_count > 1, unmap is achieved by clearing pud and * decrementing the ref count. If count == 1, the pte page is not shared. * * Called with page table lock held. * * returns: 1 successfully unmapped a shared pte page * 0 the underlying pte page is not shared, or it is the last user */ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { pgd_t *pgd = pgd_offset(mm, addr); p4d_t *p4d = p4d_offset(pgd, addr); pud_t *pud = pud_offset(p4d, addr); i_mmap_assert_write_locked(vma->vm_file->f_mapping); hugetlb_vma_assert_locked(vma); BUG_ON(page_count(virt_to_page(ptep)) == 0); if (page_count(virt_to_page(ptep)) == 1) return 0; pud_clear(pud); put_page(virt_to_page(ptep)); mm_dec_nr_pmds(mm); return 1; } #else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pud_t *pud) { return NULL; } int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { return 0; } void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, unsigned long *start, unsigned long *end) { } bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr) { return false; } #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ #ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long sz) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; pte_t *pte = NULL; pgd = pgd_offset(mm, addr); p4d = p4d_alloc(mm, pgd, addr); if (!p4d) return NULL; pud = pud_alloc(mm, p4d, addr); if (pud) { if (sz == PUD_SIZE) { pte = (pte_t *)pud; } else { BUG_ON(sz != PMD_SIZE); if (want_pmd_share(vma, addr) && pud_none(*pud)) pte = huge_pmd_share(mm, vma, addr, pud); else pte = (pte_t *)pmd_alloc(mm, pud, addr); } } if (pte) { pte_t pteval = ptep_get_lockless(pte); BUG_ON(pte_present(pteval) && !pte_huge(pteval)); } return pte; } /* * huge_pte_offset() - Walk the page table to resolve the hugepage * entry at address @addr * * Return: Pointer to page table entry (PUD or PMD) for * address @addr, or NULL if a !p*d_present() entry is encountered and the * size @sz doesn't match the hugepage size at this level of the page * table. */ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; pmd_t *pmd; pgd = pgd_offset(mm, addr); if (!pgd_present(*pgd)) return NULL; p4d = p4d_offset(pgd, addr); if (!p4d_present(*p4d)) return NULL; pud = pud_offset(p4d, addr); if (sz == PUD_SIZE) /* must be pud huge, non-present or none */ return (pte_t *)pud; if (!pud_present(*pud)) return NULL; /* must have a valid entry and size to go further */ pmd = pmd_offset(pud, addr); /* must be pmd huge, non-present or none */ return (pte_t *)pmd; } /* * Return a mask that can be used to update an address to the last huge * page in a page table page mapping size. Used to skip non-present * page table entries when linearly scanning address ranges. Architectures * with unique huge page to page table relationships can define their own * version of this routine. */ unsigned long hugetlb_mask_last_page(struct hstate *h) { unsigned long hp_size = huge_page_size(h); if (hp_size == PUD_SIZE) return P4D_SIZE - PUD_SIZE; else if (hp_size == PMD_SIZE) return PUD_SIZE - PMD_SIZE; else return 0UL; } #else /* See description above. Architectures can provide their own version. */ __weak unsigned long hugetlb_mask_last_page(struct hstate *h) { #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE if (huge_page_size(h) == PMD_SIZE) return PUD_SIZE - PMD_SIZE; #endif return 0UL; } #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ /* * These functions are overwritable if your architecture needs its own * behavior. */ bool isolate_hugetlb(struct folio *folio, struct list_head *list) { bool ret = true; spin_lock_irq(&hugetlb_lock); if (!folio_test_hugetlb(folio) || !folio_test_hugetlb_migratable(folio) || !folio_try_get(folio)) { ret = false; goto unlock; } folio_clear_hugetlb_migratable(folio); list_move_tail(&folio->lru, list); unlock: spin_unlock_irq(&hugetlb_lock); return ret; } int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison) { int ret = 0; *hugetlb = false; spin_lock_irq(&hugetlb_lock); if (folio_test_hugetlb(folio)) { *hugetlb = true; if (folio_test_hugetlb_freed(folio)) ret = 0; else if (folio_test_hugetlb_migratable(folio) || unpoison) ret = folio_try_get(folio); else ret = -EBUSY; } spin_unlock_irq(&hugetlb_lock); return ret; } int get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared) { int ret; spin_lock_irq(&hugetlb_lock); ret = __get_huge_page_for_hwpoison(pfn, flags, migratable_cleared); spin_unlock_irq(&hugetlb_lock); return ret; } void folio_putback_active_hugetlb(struct folio *folio) { spin_lock_irq(&hugetlb_lock); folio_set_hugetlb_migratable(folio); list_move_tail(&folio->lru, &(folio_hstate(folio))->hugepage_activelist); spin_unlock_irq(&hugetlb_lock); folio_put(folio); } void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason) { struct hstate *h = folio_hstate(old_folio); hugetlb_cgroup_migrate(old_folio, new_folio); set_page_owner_migrate_reason(&new_folio->page, reason); /* * transfer temporary state of the new hugetlb folio. This is * reverse to other transitions because the newpage is going to * be final while the old one will be freed so it takes over * the temporary status. * * Also note that we have to transfer the per-node surplus state * here as well otherwise the global surplus count will not match * the per-node's. */ if (folio_test_hugetlb_temporary(new_folio)) { int old_nid = folio_nid(old_folio); int new_nid = folio_nid(new_folio); folio_set_hugetlb_temporary(old_folio); folio_clear_hugetlb_temporary(new_folio); /* * There is no need to transfer the per-node surplus state * when we do not cross the node. */ if (new_nid == old_nid) return; spin_lock_irq(&hugetlb_lock); if (h->surplus_huge_pages_node[old_nid]) { h->surplus_huge_pages_node[old_nid]--; h->surplus_huge_pages_node[new_nid]++; } spin_unlock_irq(&hugetlb_lock); } } static void hugetlb_unshare_pmds(struct vm_area_struct *vma, unsigned long start, unsigned long end) { struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); struct mm_struct *mm = vma->vm_mm; struct mmu_notifier_range range; unsigned long address; spinlock_t *ptl; pte_t *ptep; if (!(vma->vm_flags & VM_MAYSHARE)) return; if (start >= end) return; flush_cache_range(vma, start, end); /* * No need to call adjust_range_if_pmd_sharing_possible(), because * we have already done the PUD_SIZE alignment. */ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, start, end); mmu_notifier_invalidate_range_start(&range); hugetlb_vma_lock_write(vma); i_mmap_lock_write(vma->vm_file->f_mapping); for (address = start; address < end; address += PUD_SIZE) { ptep = hugetlb_walk(vma, address, sz); if (!ptep) continue; ptl = huge_pte_lock(h, mm, ptep); huge_pmd_unshare(mm, vma, address, ptep); spin_unlock(ptl); } flush_hugetlb_tlb_range(vma, start, end); i_mmap_unlock_write(vma->vm_file->f_mapping); hugetlb_vma_unlock_write(vma); /* * No need to call mmu_notifier_arch_invalidate_secondary_tlbs(), see * Documentation/mm/mmu_notifier.rst. */ mmu_notifier_invalidate_range_end(&range); } /* * This function will unconditionally remove all the shared pmd pgtable entries * within the specific vma for a hugetlbfs memory range. */ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) { hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE), ALIGN_DOWN(vma->vm_end, PUD_SIZE)); } #ifdef CONFIG_CMA static bool cma_reserve_called __initdata; static int __init cmdline_parse_hugetlb_cma(char *p) { int nid, count = 0; unsigned long tmp; char *s = p; while (*s) { if (sscanf(s, "%lu%n", &tmp, &count) != 1) break; if (s[count] == ':') { if (tmp >= MAX_NUMNODES) break; nid = array_index_nospec(tmp, MAX_NUMNODES); s += count + 1; tmp = memparse(s, &s); hugetlb_cma_size_in_node[nid] = tmp; hugetlb_cma_size += tmp; /* * Skip the separator if have one, otherwise * break the parsing. */ if (*s == ',') s++; else break; } else { hugetlb_cma_size = memparse(p, &p); break; } } return 0; } early_param("hugetlb_cma", cmdline_parse_hugetlb_cma); void __init hugetlb_cma_reserve(int order) { unsigned long size, reserved, per_node; bool node_specific_cma_alloc = false; int nid; cma_reserve_called = true; if (!hugetlb_cma_size) return; for (nid = 0; nid < MAX_NUMNODES; nid++) { if (hugetlb_cma_size_in_node[nid] == 0) continue; if (!node_online(nid)) { pr_warn("hugetlb_cma: invalid node %d specified\n", nid); hugetlb_cma_size -= hugetlb_cma_size_in_node[nid]; hugetlb_cma_size_in_node[nid] = 0; continue; } if (hugetlb_cma_size_in_node[nid] < (PAGE_SIZE << order)) { pr_warn("hugetlb_cma: cma area of node %d should be at least %lu MiB\n", nid, (PAGE_SIZE << order) / SZ_1M); hugetlb_cma_size -= hugetlb_cma_size_in_node[nid]; hugetlb_cma_size_in_node[nid] = 0; } else { node_specific_cma_alloc = true; } } /* Validate the CMA size again in case some invalid nodes specified. */ if (!hugetlb_cma_size) return; if (hugetlb_cma_size < (PAGE_SIZE << order)) { pr_warn("hugetlb_cma: cma area should be at least %lu MiB\n", (PAGE_SIZE << order) / SZ_1M); hugetlb_cma_size = 0; return; } if (!node_specific_cma_alloc) { /* * If 3 GB area is requested on a machine with 4 numa nodes, * let's allocate 1 GB on first three nodes and ignore the last one. */ per_node = DIV_ROUND_UP(hugetlb_cma_size, nr_online_nodes); pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n", hugetlb_cma_size / SZ_1M, per_node / SZ_1M); } reserved = 0; for_each_online_node(nid) { int res; char name[CMA_MAX_NAME]; if (node_specific_cma_alloc) { if (hugetlb_cma_size_in_node[nid] == 0) continue; size = hugetlb_cma_size_in_node[nid]; } else { size = min(per_node, hugetlb_cma_size - reserved); } size = round_up(size, PAGE_SIZE << order); snprintf(name, sizeof(name), "hugetlb%d", nid); /* * Note that 'order per bit' is based on smallest size that * may be returned to CMA allocator in the case of * huge page demotion. */ res = cma_declare_contiguous_nid(0, size, 0, PAGE_SIZE << HUGETLB_PAGE_ORDER, 0, false, name, &hugetlb_cma[nid], nid); if (res) { pr_warn("hugetlb_cma: reservation failed: err %d, node %d", res, nid); continue; } reserved += size; pr_info("hugetlb_cma: reserved %lu MiB on node %d\n", size / SZ_1M, nid); if (reserved >= hugetlb_cma_size) break; } if (!reserved) /* * hugetlb_cma_size is used to determine if allocations from * cma are possible. Set to zero if no cma regions are set up. */ hugetlb_cma_size = 0; } static void __init hugetlb_cma_check(void) { if (!hugetlb_cma_size || cma_reserve_called) return; pr_warn("hugetlb_cma: the option isn't supported by current arch\n"); } #endif /* CONFIG_CMA */
5 1 1 30 30 26 5 1 30 29 30 30 26 6 6 1 30 9 22 26 30 3 3 3 1 1 1 1 1 1 4 1 1 2 1 1 14 1 8 16 16 15 1 4 1 3 8 2 1 1 5 2 4 4 8 2 7 6 5 9 19 19 4 5 7 1 8 2 1 1 8 8 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 // SPDX-License-Identifier: GPL-2.0-only /* * common code for virtio vsock * * Copyright (C) 2013-2015 Red Hat, Inc. * Author: Asias He <asias@redhat.com> * Stefan Hajnoczi <stefanha@redhat.com> */ #include <linux/spinlock.h> #include <linux/module.h> #include <linux/sched/signal.h> #include <linux/ctype.h> #include <linux/list.h> #include <linux/virtio_vsock.h> #include <uapi/linux/vsockmon.h> #include <net/sock.h> #include <net/af_vsock.h> #define CREATE_TRACE_POINTS #include <trace/events/vsock_virtio_transport_common.h> /* How long to wait for graceful shutdown of a connection */ #define VSOCK_CLOSE_TIMEOUT (8 * HZ) /* Threshold for detecting small packets to copy */ #define GOOD_COPY_LEN 128 static const struct virtio_transport * virtio_transport_get_ops(struct vsock_sock *vsk) { const struct vsock_transport *t = vsock_core_get_transport(vsk); if (WARN_ON(!t)) return NULL; return container_of(t, struct virtio_transport, transport); } static bool virtio_transport_can_zcopy(const struct virtio_transport *t_ops, struct virtio_vsock_pkt_info *info, size_t pkt_len) { struct iov_iter *iov_iter; if (!info->msg) return false; iov_iter = &info->msg->msg_iter; if (iov_iter->iov_offset) return false; /* We can't send whole iov. */ if (iov_iter->count > pkt_len) return false; /* Check that transport can send data in zerocopy mode. */ t_ops = virtio_transport_get_ops(info->vsk); if (t_ops->can_msgzerocopy) { int pages_to_send = iov_iter_npages(iov_iter, MAX_SKB_FRAGS); /* +1 is for packet header. */ return t_ops->can_msgzerocopy(pages_to_send + 1); } return true; } static int virtio_transport_init_zcopy_skb(struct vsock_sock *vsk, struct sk_buff *skb, struct msghdr *msg, bool zerocopy) { struct ubuf_info *uarg; if (msg->msg_ubuf) { uarg = msg->msg_ubuf; net_zcopy_get(uarg); } else { struct iov_iter *iter = &msg->msg_iter; struct ubuf_info_msgzc *uarg_zc; uarg = msg_zerocopy_realloc(sk_vsock(vsk), iter->count, NULL); if (!uarg) return -1; uarg_zc = uarg_to_msgzc(uarg); uarg_zc->zerocopy = zerocopy ? 1 : 0; } skb_zcopy_init(skb, uarg); return 0; } static int virtio_transport_fill_skb(struct sk_buff *skb, struct virtio_vsock_pkt_info *info, size_t len, bool zcopy) { if (zcopy) return __zerocopy_sg_from_iter(info->msg, NULL, skb, &info->msg->msg_iter, len); return memcpy_from_msg(skb_put(skb, len), info->msg, len); } static void virtio_transport_init_hdr(struct sk_buff *skb, struct virtio_vsock_pkt_info *info, size_t payload_len, u32 src_cid, u32 src_port, u32 dst_cid, u32 dst_port) { struct virtio_vsock_hdr *hdr; hdr = virtio_vsock_hdr(skb); hdr->type = cpu_to_le16(info->type); hdr->op = cpu_to_le16(info->op); hdr->src_cid = cpu_to_le64(src_cid); hdr->dst_cid = cpu_to_le64(dst_cid); hdr->src_port = cpu_to_le32(src_port); hdr->dst_port = cpu_to_le32(dst_port); hdr->flags = cpu_to_le32(info->flags); hdr->len = cpu_to_le32(payload_len); hdr->buf_alloc = cpu_to_le32(0); hdr->fwd_cnt = cpu_to_le32(0); } static void virtio_transport_copy_nonlinear_skb(const struct sk_buff *skb, void *dst, size_t len) { struct iov_iter iov_iter = { 0 }; struct kvec kvec; size_t to_copy; kvec.iov_base = dst; kvec.iov_len = len; iov_iter.iter_type = ITER_KVEC; iov_iter.kvec = &kvec; iov_iter.nr_segs = 1; to_copy = min_t(size_t, len, skb->len); skb_copy_datagram_iter(skb, VIRTIO_VSOCK_SKB_CB(skb)->offset, &iov_iter, to_copy); } /* Packet capture */ static struct sk_buff *virtio_transport_build_skb(void *opaque) { struct virtio_vsock_hdr *pkt_hdr; struct sk_buff *pkt = opaque; struct af_vsockmon_hdr *hdr; struct sk_buff *skb; size_t payload_len; /* A packet could be split to fit the RX buffer, so we can retrieve * the payload length from the header and the buffer pointer taking * care of the offset in the original packet. */ pkt_hdr = virtio_vsock_hdr(pkt); payload_len = pkt->len; skb = alloc_skb(sizeof(*hdr) + sizeof(*pkt_hdr) + payload_len, GFP_ATOMIC); if (!skb) return NULL; hdr = skb_put(skb, sizeof(*hdr)); /* pkt->hdr is little-endian so no need to byteswap here */ hdr->src_cid = pkt_hdr->src_cid; hdr->src_port = pkt_hdr->src_port; hdr->dst_cid = pkt_hdr->dst_cid; hdr->dst_port = pkt_hdr->dst_port; hdr->transport = cpu_to_le16(AF_VSOCK_TRANSPORT_VIRTIO); hdr->len = cpu_to_le16(sizeof(*pkt_hdr)); memset(hdr->reserved, 0, sizeof(hdr->reserved)); switch (le16_to_cpu(pkt_hdr->op)) { case VIRTIO_VSOCK_OP_REQUEST: case VIRTIO_VSOCK_OP_RESPONSE: hdr->op = cpu_to_le16(AF_VSOCK_OP_CONNECT); break; case VIRTIO_VSOCK_OP_RST: case VIRTIO_VSOCK_OP_SHUTDOWN: hdr->op = cpu_to_le16(AF_VSOCK_OP_DISCONNECT); break; case VIRTIO_VSOCK_OP_RW: hdr->op = cpu_to_le16(AF_VSOCK_OP_PAYLOAD); break; case VIRTIO_VSOCK_OP_CREDIT_UPDATE: case VIRTIO_VSOCK_OP_CREDIT_REQUEST: hdr->op = cpu_to_le16(AF_VSOCK_OP_CONTROL); break; default: hdr->op = cpu_to_le16(AF_VSOCK_OP_UNKNOWN); break; } skb_put_data(skb, pkt_hdr, sizeof(*pkt_hdr)); if (payload_len) { if (skb_is_nonlinear(pkt)) { void *data = skb_put(skb, payload_len); virtio_transport_copy_nonlinear_skb(pkt, data, payload_len); } else { skb_put_data(skb, pkt->data, payload_len); } } return skb; } void virtio_transport_deliver_tap_pkt(struct sk_buff *skb) { if (virtio_vsock_skb_tap_delivered(skb)) return; vsock_deliver_tap(virtio_transport_build_skb, skb); virtio_vsock_skb_set_tap_delivered(skb); } EXPORT_SYMBOL_GPL(virtio_transport_deliver_tap_pkt); static u16 virtio_transport_get_type(struct sock *sk) { if (sk->sk_type == SOCK_STREAM) return VIRTIO_VSOCK_TYPE_STREAM; else return VIRTIO_VSOCK_TYPE_SEQPACKET; } /* Returns new sk_buff on success, otherwise returns NULL. */ static struct sk_buff *virtio_transport_alloc_skb(struct virtio_vsock_pkt_info *info, size_t payload_len, bool zcopy, u32 src_cid, u32 src_port, u32 dst_cid, u32 dst_port) { struct vsock_sock *vsk; struct sk_buff *skb; size_t skb_len; skb_len = VIRTIO_VSOCK_SKB_HEADROOM; if (!zcopy) skb_len += payload_len; skb = virtio_vsock_alloc_skb(skb_len, GFP_KERNEL); if (!skb) return NULL; virtio_transport_init_hdr(skb, info, payload_len, src_cid, src_port, dst_cid, dst_port); vsk = info->vsk; /* If 'vsk' != NULL then payload is always present, so we * will never call '__zerocopy_sg_from_iter()' below without * setting skb owner in 'skb_set_owner_w()'. The only case * when 'vsk' == NULL is VIRTIO_VSOCK_OP_RST control message * without payload. */ WARN_ON_ONCE(!(vsk && (info->msg && payload_len)) && zcopy); /* Set owner here, because '__zerocopy_sg_from_iter()' uses * owner of skb without check to update 'sk_wmem_alloc'. */ if (vsk) skb_set_owner_w(skb, sk_vsock(vsk)); if (info->msg && payload_len > 0) { int err; err = virtio_transport_fill_skb(skb, info, payload_len, zcopy); if (err) goto out; if (msg_data_left(info->msg) == 0 && info->type == VIRTIO_VSOCK_TYPE_SEQPACKET) { struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb); hdr->flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOM); if (info->msg->msg_flags & MSG_EOR) hdr->flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR); } } if (info->reply) virtio_vsock_skb_set_reply(skb); trace_virtio_transport_alloc_pkt(src_cid, src_port, dst_cid, dst_port, payload_len, info->type, info->op, info->flags, zcopy); return skb; out: kfree_skb(skb); return NULL; } /* This function can only be used on connecting/connected sockets, * since a socket assigned to a transport is required. * * Do not use on listener sockets! */ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk, struct virtio_vsock_pkt_info *info) { u32 max_skb_len = VIRTIO_VSOCK_MAX_PKT_BUF_SIZE; u32 src_cid, src_port, dst_cid, dst_port; const struct virtio_transport *t_ops; struct virtio_vsock_sock *vvs; u32 pkt_len = info->pkt_len; bool can_zcopy = false; u32 rest_len; int ret; info->type = virtio_transport_get_type(sk_vsock(vsk)); t_ops = virtio_transport_get_ops(vsk); if (unlikely(!t_ops)) return -EFAULT; src_cid = t_ops->transport.get_local_cid(); src_port = vsk->local_addr.svm_port; if (!info->remote_cid) { dst_cid = vsk->remote_addr.svm_cid; dst_port = vsk->remote_addr.svm_port; } else { dst_cid = info->remote_cid; dst_port = info->remote_port; } vvs = vsk->trans; /* virtio_transport_get_credit might return less than pkt_len credit */ pkt_len = virtio_transport_get_credit(vvs, pkt_len); /* Do not send zero length OP_RW pkt */ if (pkt_len == 0 && info->op == VIRTIO_VSOCK_OP_RW) return pkt_len; if (info->msg) { /* If zerocopy is not enabled by 'setsockopt()', we behave as * there is no MSG_ZEROCOPY flag set. */ if (!sock_flag(sk_vsock(vsk), SOCK_ZEROCOPY)) info->msg->msg_flags &= ~MSG_ZEROCOPY; if (info->msg->msg_flags & MSG_ZEROCOPY) can_zcopy = virtio_transport_can_zcopy(t_ops, info, pkt_len); if (can_zcopy) max_skb_len = min_t(u32, VIRTIO_VSOCK_MAX_PKT_BUF_SIZE, (MAX_SKB_FRAGS * PAGE_SIZE)); } rest_len = pkt_len; do { struct sk_buff *skb; size_t skb_len; skb_len = min(max_skb_len, rest_len); skb = virtio_transport_alloc_skb(info, skb_len, can_zcopy, src_cid, src_port, dst_cid, dst_port); if (!skb) { ret = -ENOMEM; break; } /* We process buffer part by part, allocating skb on * each iteration. If this is last skb for this buffer * and MSG_ZEROCOPY mode is in use - we must allocate * completion for the current syscall. */ if (info->msg && info->msg->msg_flags & MSG_ZEROCOPY && skb_len == rest_len && info->op == VIRTIO_VSOCK_OP_RW) { if (virtio_transport_init_zcopy_skb(vsk, skb, info->msg, can_zcopy)) { ret = -ENOMEM; break; } } virtio_transport_inc_tx_pkt(vvs, skb); ret = t_ops->send_pkt(skb); if (ret < 0) break; /* Both virtio and vhost 'send_pkt()' returns 'skb_len', * but for reliability use 'ret' instead of 'skb_len'. * Also if partial send happens (e.g. 'ret' != 'skb_len') * somehow, we break this loop, but account such returned * value in 'virtio_transport_put_credit()'. */ rest_len -= ret; if (WARN_ONCE(ret != skb_len, "'send_pkt()' returns %i, but %zu expected\n", ret, skb_len)) break; } while (rest_len); virtio_transport_put_credit(vvs, rest_len); /* Return number of bytes, if any data has been sent. */ if (rest_len != pkt_len) ret = pkt_len - rest_len; return ret; } static bool virtio_transport_inc_rx_pkt(struct virtio_vsock_sock *vvs, u32 len) { if (vvs->rx_bytes + len > vvs->buf_alloc) return false; vvs->rx_bytes += len; return true; } static void virtio_transport_dec_rx_pkt(struct virtio_vsock_sock *vvs, u32 len) { vvs->rx_bytes -= len; vvs->fwd_cnt += len; } void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock *vvs, struct sk_buff *skb) { struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb); spin_lock_bh(&vvs->rx_lock); vvs->last_fwd_cnt = vvs->fwd_cnt; hdr->fwd_cnt = cpu_to_le32(vvs->fwd_cnt); hdr->buf_alloc = cpu_to_le32(vvs->buf_alloc); spin_unlock_bh(&vvs->rx_lock); } EXPORT_SYMBOL_GPL(virtio_transport_inc_tx_pkt); u32 virtio_transport_get_credit(struct virtio_vsock_sock *vvs, u32 credit) { u32 ret; if (!credit) return 0; spin_lock_bh(&vvs->tx_lock); ret = vvs->peer_buf_alloc - (vvs->tx_cnt - vvs->peer_fwd_cnt); if (ret > credit) ret = credit; vvs->tx_cnt += ret; spin_unlock_bh(&vvs->tx_lock); return ret; } EXPORT_SYMBOL_GPL(virtio_transport_get_credit); void virtio_transport_put_credit(struct virtio_vsock_sock *vvs, u32 credit) { if (!credit) return; spin_lock_bh(&vvs->tx_lock); vvs->tx_cnt -= credit; spin_unlock_bh(&vvs->tx_lock); } EXPORT_SYMBOL_GPL(virtio_transport_put_credit); static int virtio_transport_send_credit_update(struct vsock_sock *vsk) { struct virtio_vsock_pkt_info info = { .op = VIRTIO_VSOCK_OP_CREDIT_UPDATE, .vsk = vsk, }; return virtio_transport_send_pkt_info(vsk, &info); } static ssize_t virtio_transport_stream_do_peek(struct vsock_sock *vsk, struct msghdr *msg, size_t len) { struct virtio_vsock_sock *vvs = vsk->trans; struct sk_buff *skb; size_t total = 0; int err; spin_lock_bh(&vvs->rx_lock); skb_queue_walk(&vvs->rx_queue, skb) { size_t bytes; bytes = len - total; if (bytes > skb->len) bytes = skb->len; spin_unlock_bh(&vvs->rx_lock); /* sk_lock is held by caller so no one else can dequeue. * Unlock rx_lock since skb_copy_datagram_iter() may sleep. */ err = skb_copy_datagram_iter(skb, VIRTIO_VSOCK_SKB_CB(skb)->offset, &msg->msg_iter, bytes); if (err) goto out; total += bytes; spin_lock_bh(&vvs->rx_lock); if (total == len) break; } spin_unlock_bh(&vvs->rx_lock); return total; out: if (total) err = total; return err; } static ssize_t virtio_transport_stream_do_dequeue(struct vsock_sock *vsk, struct msghdr *msg, size_t len) { struct virtio_vsock_sock *vvs = vsk->trans; size_t bytes, total = 0; struct sk_buff *skb; u32 fwd_cnt_delta; bool low_rx_bytes; int err = -EFAULT; u32 free_space; spin_lock_bh(&vvs->rx_lock); if (WARN_ONCE(skb_queue_empty(&vvs->rx_queue) && vvs->rx_bytes, "rx_queue is empty, but rx_bytes is non-zero\n")) { spin_unlock_bh(&vvs->rx_lock); return err; } while (total < len && !skb_queue_empty(&vvs->rx_queue)) { skb = skb_peek(&vvs->rx_queue); bytes = min_t(size_t, len - total, skb->len - VIRTIO_VSOCK_SKB_CB(skb)->offset); /* sk_lock is held by caller so no one else can dequeue. * Unlock rx_lock since skb_copy_datagram_iter() may sleep. */ spin_unlock_bh(&vvs->rx_lock); err = skb_copy_datagram_iter(skb, VIRTIO_VSOCK_SKB_CB(skb)->offset, &msg->msg_iter, bytes); if (err) goto out; spin_lock_bh(&vvs->rx_lock); total += bytes; VIRTIO_VSOCK_SKB_CB(skb)->offset += bytes; if (skb->len == VIRTIO_VSOCK_SKB_CB(skb)->offset) { u32 pkt_len = le32_to_cpu(virtio_vsock_hdr(skb)->len); virtio_transport_dec_rx_pkt(vvs, pkt_len); __skb_unlink(skb, &vvs->rx_queue); consume_skb(skb); } } fwd_cnt_delta = vvs->fwd_cnt - vvs->last_fwd_cnt; free_space = vvs->buf_alloc - fwd_cnt_delta; low_rx_bytes = (vvs->rx_bytes < sock_rcvlowat(sk_vsock(vsk), 0, INT_MAX)); spin_unlock_bh(&vvs->rx_lock); /* To reduce the number of credit update messages, * don't update credits as long as lots of space is available. * Note: the limit chosen here is arbitrary. Setting the limit * too high causes extra messages. Too low causes transmitter * stalls. As stalls are in theory more expensive than extra * messages, we set the limit to a high value. TODO: experiment * with different values. Also send credit update message when * number of bytes in rx queue is not enough to wake up reader. */ if (fwd_cnt_delta && (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE || low_rx_bytes)) virtio_transport_send_credit_update(vsk); return total; out: if (total) err = total; return err; } static ssize_t virtio_transport_seqpacket_do_peek(struct vsock_sock *vsk, struct msghdr *msg) { struct virtio_vsock_sock *vvs = vsk->trans; struct sk_buff *skb; size_t total, len; spin_lock_bh(&vvs->rx_lock); if (!vvs->msg_count) { spin_unlock_bh(&vvs->rx_lock); return 0; } total = 0; len = msg_data_left(msg); skb_queue_walk(&vvs->rx_queue, skb) { struct virtio_vsock_hdr *hdr; if (total < len) { size_t bytes; int err; bytes = len - total; if (bytes > skb->len) bytes = skb->len; spin_unlock_bh(&vvs->rx_lock); /* sk_lock is held by caller so no one else can dequeue. * Unlock rx_lock since skb_copy_datagram_iter() may sleep. */ err = skb_copy_datagram_iter(skb, VIRTIO_VSOCK_SKB_CB(skb)->offset, &msg->msg_iter, bytes); if (err) return err; spin_lock_bh(&vvs->rx_lock); } total += skb->len; hdr = virtio_vsock_hdr(skb); if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOM) { if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOR) msg->msg_flags |= MSG_EOR; break; } } spin_unlock_bh(&vvs->rx_lock); return total; } static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk, struct msghdr *msg, int flags) { struct virtio_vsock_sock *vvs = vsk->trans; int dequeued_len = 0; size_t user_buf_len = msg_data_left(msg); bool msg_ready = false; struct sk_buff *skb; spin_lock_bh(&vvs->rx_lock); if (vvs->msg_count == 0) { spin_unlock_bh(&vvs->rx_lock); return 0; } while (!msg_ready) { struct virtio_vsock_hdr *hdr; size_t pkt_len; skb = __skb_dequeue(&vvs->rx_queue); if (!skb) break; hdr = virtio_vsock_hdr(skb); pkt_len = (size_t)le32_to_cpu(hdr->len); if (dequeued_len >= 0) { size_t bytes_to_copy; bytes_to_copy = min(user_buf_len, pkt_len); if (bytes_to_copy) { int err; /* sk_lock is held by caller so no one else can dequeue. * Unlock rx_lock since skb_copy_datagram_iter() may sleep. */ spin_unlock_bh(&vvs->rx_lock); err = skb_copy_datagram_iter(skb, 0, &msg->msg_iter, bytes_to_copy); if (err) { /* Copy of message failed. Rest of * fragments will be freed without copy. */ dequeued_len = err; } else { user_buf_len -= bytes_to_copy; } spin_lock_bh(&vvs->rx_lock); } if (dequeued_len >= 0) dequeued_len += pkt_len; } if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOM) { msg_ready = true; vvs->msg_count--; if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOR) msg->msg_flags |= MSG_EOR; } virtio_transport_dec_rx_pkt(vvs, pkt_len); kfree_skb(skb); } spin_unlock_bh(&vvs->rx_lock); virtio_transport_send_credit_update(vsk); return dequeued_len; } ssize_t virtio_transport_stream_dequeue(struct vsock_sock *vsk, struct msghdr *msg, size_t len, int flags) { if (flags & MSG_PEEK) return virtio_transport_stream_do_peek(vsk, msg, len); else return virtio_transport_stream_do_dequeue(vsk, msg, len); } EXPORT_SYMBOL_GPL(virtio_transport_stream_dequeue); ssize_t virtio_transport_seqpacket_dequeue(struct vsock_sock *vsk, struct msghdr *msg, int flags) { if (flags & MSG_PEEK) return virtio_transport_seqpacket_do_peek(vsk, msg); else return virtio_transport_seqpacket_do_dequeue(vsk, msg, flags); } EXPORT_SYMBOL_GPL(virtio_transport_seqpacket_dequeue); int virtio_transport_seqpacket_enqueue(struct vsock_sock *vsk, struct msghdr *msg, size_t len) { struct virtio_vsock_sock *vvs = vsk->trans; spin_lock_bh(&vvs->tx_lock); if (len > vvs->peer_buf_alloc) { spin_unlock_bh(&vvs->tx_lock); return -EMSGSIZE; } spin_unlock_bh(&vvs->tx_lock); return virtio_transport_stream_enqueue(vsk, msg, len); } EXPORT_SYMBOL_GPL(virtio_transport_seqpacket_enqueue); int virtio_transport_dgram_dequeue(struct vsock_sock *vsk, struct msghdr *msg, size_t len, int flags) { return -EOPNOTSUPP; } EXPORT_SYMBOL_GPL(virtio_transport_dgram_dequeue); s64 virtio_transport_stream_has_data(struct vsock_sock *vsk) { struct virtio_vsock_sock *vvs = vsk->trans; s64 bytes; spin_lock_bh(&vvs->rx_lock); bytes = vvs->rx_bytes; spin_unlock_bh(&vvs->rx_lock); return bytes; } EXPORT_SYMBOL_GPL(virtio_transport_stream_has_data); u32 virtio_transport_seqpacket_has_data(struct vsock_sock *vsk) { struct virtio_vsock_sock *vvs = vsk->trans; u32 msg_count; spin_lock_bh(&vvs->rx_lock); msg_count = vvs->msg_count; spin_unlock_bh(&vvs->rx_lock); return msg_count; } EXPORT_SYMBOL_GPL(virtio_transport_seqpacket_has_data); static s64 virtio_transport_has_space(struct vsock_sock *vsk) { struct virtio_vsock_sock *vvs = vsk->trans; s64 bytes; bytes = (s64)vvs->peer_buf_alloc - (vvs->tx_cnt - vvs->peer_fwd_cnt); if (bytes < 0) bytes = 0; return bytes; } s64 virtio_transport_stream_has_space(struct vsock_sock *vsk) { struct virtio_vsock_sock *vvs = vsk->trans; s64 bytes; spin_lock_bh(&vvs->tx_lock); bytes = virtio_transport_has_space(vsk); spin_unlock_bh(&vvs->tx_lock); return bytes; } EXPORT_SYMBOL_GPL(virtio_transport_stream_has_space); int virtio_transport_do_socket_init(struct vsock_sock *vsk, struct vsock_sock *psk) { struct virtio_vsock_sock *vvs; vvs = kzalloc(sizeof(*vvs), GFP_KERNEL); if (!vvs) return -ENOMEM; vsk->trans = vvs; vvs->vsk = vsk; if (psk && psk->trans) { struct virtio_vsock_sock *ptrans = psk->trans; vvs->peer_buf_alloc = ptrans->peer_buf_alloc; } if (vsk->buffer_size > VIRTIO_VSOCK_MAX_BUF_SIZE) vsk->buffer_size = VIRTIO_VSOCK_MAX_BUF_SIZE; vvs->buf_alloc = vsk->buffer_size; spin_lock_init(&vvs->rx_lock); spin_lock_init(&vvs->tx_lock); skb_queue_head_init(&vvs->rx_queue); return 0; } EXPORT_SYMBOL_GPL(virtio_transport_do_socket_init); /* sk_lock held by the caller */ void virtio_transport_notify_buffer_size(struct vsock_sock *vsk, u64 *val) { struct virtio_vsock_sock *vvs = vsk->trans; if (*val > VIRTIO_VSOCK_MAX_BUF_SIZE) *val = VIRTIO_VSOCK_MAX_BUF_SIZE; vvs->buf_alloc = *val; virtio_transport_send_credit_update(vsk); } EXPORT_SYMBOL_GPL(virtio_transport_notify_buffer_size); int virtio_transport_notify_poll_in(struct vsock_sock *vsk, size_t target, bool *data_ready_now) { *data_ready_now = vsock_stream_has_data(vsk) >= target; return 0; } EXPORT_SYMBOL_GPL(virtio_transport_notify_poll_in); int virtio_transport_notify_poll_out(struct vsock_sock *vsk, size_t target, bool *space_avail_now) { s64 free_space; free_space = vsock_stream_has_space(vsk); if (free_space > 0) *space_avail_now = true; else if (free_space == 0) *space_avail_now = false; return 0; } EXPORT_SYMBOL_GPL(virtio_transport_notify_poll_out); int virtio_transport_notify_recv_init(struct vsock_sock *vsk, size_t target, struct vsock_transport_recv_notify_data *data) { return 0; } EXPORT_SYMBOL_GPL(virtio_transport_notify_recv_init); int virtio_transport_notify_recv_pre_block(struct vsock_sock *vsk, size_t target, struct vsock_transport_recv_notify_data *data) { return 0; } EXPORT_SYMBOL_GPL(virtio_transport_notify_recv_pre_block); int virtio_transport_notify_recv_pre_dequeue(struct vsock_sock *vsk, size_t target, struct vsock_transport_recv_notify_data *data) { return 0; } EXPORT_SYMBOL_GPL(virtio_transport_notify_recv_pre_dequeue); int virtio_transport_notify_recv_post_dequeue(struct vsock_sock *vsk, size_t target, ssize_t copied, bool data_read, struct vsock_transport_recv_notify_data *data) { return 0; } EXPORT_SYMBOL_GPL(virtio_transport_notify_recv_post_dequeue); int virtio_transport_notify_send_init(struct vsock_sock *vsk, struct vsock_transport_send_notify_data *data) { return 0; } EXPORT_SYMBOL_GPL(virtio_transport_notify_send_init); int virtio_transport_notify_send_pre_block(struct vsock_sock *vsk, struct vsock_transport_send_notify_data *data) { return 0; } EXPORT_SYMBOL_GPL(virtio_transport_notify_send_pre_block); int virtio_transport_notify_send_pre_enqueue(struct vsock_sock *vsk, struct vsock_transport_send_notify_data *data) { return 0; } EXPORT_SYMBOL_GPL(virtio_transport_notify_send_pre_enqueue); int virtio_transport_notify_send_post_enqueue(struct vsock_sock *vsk, ssize_t written, struct vsock_transport_send_notify_data *data) { return 0; } EXPORT_SYMBOL_GPL(virtio_transport_notify_send_post_enqueue); u64 virtio_transport_stream_rcvhiwat(struct vsock_sock *vsk) { return vsk->buffer_size; } EXPORT_SYMBOL_GPL(virtio_transport_stream_rcvhiwat); bool virtio_transport_stream_is_active(struct vsock_sock *vsk) { return true; } EXPORT_SYMBOL_GPL(virtio_transport_stream_is_active); bool virtio_transport_stream_allow(u32 cid, u32 port) { return true; } EXPORT_SYMBOL_GPL(virtio_transport_stream_allow); int virtio_transport_dgram_bind(struct vsock_sock *vsk, struct sockaddr_vm *addr) { return -EOPNOTSUPP; } EXPORT_SYMBOL_GPL(virtio_transport_dgram_bind); bool virtio_transport_dgram_allow(u32 cid, u32 port) { return false; } EXPORT_SYMBOL_GPL(virtio_transport_dgram_allow); int virtio_transport_connect(struct vsock_sock *vsk) { struct virtio_vsock_pkt_info info = { .op = VIRTIO_VSOCK_OP_REQUEST, .vsk = vsk, }; return virtio_transport_send_pkt_info(vsk, &info); } EXPORT_SYMBOL_GPL(virtio_transport_connect); int virtio_transport_shutdown(struct vsock_sock *vsk, int mode) { struct virtio_vsock_pkt_info info = { .op = VIRTIO_VSOCK_OP_SHUTDOWN, .flags = (mode & RCV_SHUTDOWN ? VIRTIO_VSOCK_SHUTDOWN_RCV : 0) | (mode & SEND_SHUTDOWN ? VIRTIO_VSOCK_SHUTDOWN_SEND : 0), .vsk = vsk, }; return virtio_transport_send_pkt_info(vsk, &info); } EXPORT_SYMBOL_GPL(virtio_transport_shutdown); int virtio_transport_dgram_enqueue(struct vsock_sock *vsk, struct sockaddr_vm *remote_addr, struct msghdr *msg, size_t dgram_len) { return -EOPNOTSUPP; } EXPORT_SYMBOL_GPL(virtio_transport_dgram_enqueue); ssize_t virtio_transport_stream_enqueue(struct vsock_sock *vsk, struct msghdr *msg, size_t len) { struct virtio_vsock_pkt_info info = { .op = VIRTIO_VSOCK_OP_RW, .msg = msg, .pkt_len = len, .vsk = vsk, }; return virtio_transport_send_pkt_info(vsk, &info); } EXPORT_SYMBOL_GPL(virtio_transport_stream_enqueue); void virtio_transport_destruct(struct vsock_sock *vsk) { struct virtio_vsock_sock *vvs = vsk->trans; kfree(vvs); } EXPORT_SYMBOL_GPL(virtio_transport_destruct); static int virtio_transport_reset(struct vsock_sock *vsk, struct sk_buff *skb) { struct virtio_vsock_pkt_info info = { .op = VIRTIO_VSOCK_OP_RST, .reply = !!skb, .vsk = vsk, }; /* Send RST only if the original pkt is not a RST pkt */ if (skb && le16_to_cpu(virtio_vsock_hdr(skb)->op) == VIRTIO_VSOCK_OP_RST) return 0; return virtio_transport_send_pkt_info(vsk, &info); } /* Normally packets are associated with a socket. There may be no socket if an * attempt was made to connect to a socket that does not exist. */ static int virtio_transport_reset_no_sock(const struct virtio_transport *t, struct sk_buff *skb) { struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb); struct virtio_vsock_pkt_info info = { .op = VIRTIO_VSOCK_OP_RST, .type = le16_to_cpu(hdr->type), .reply = true, }; struct sk_buff *reply; /* Send RST only if the original pkt is not a RST pkt */ if (le16_to_cpu(hdr->op) == VIRTIO_VSOCK_OP_RST) return 0; if (!t) return -ENOTCONN; reply = virtio_transport_alloc_skb(&info, 0, false, le64_to_cpu(hdr->dst_cid), le32_to_cpu(hdr->dst_port), le64_to_cpu(hdr->src_cid), le32_to_cpu(hdr->src_port)); if (!reply) return -ENOMEM; return t->send_pkt(reply); } /* This function should be called with sk_lock held and SOCK_DONE set */ static void virtio_transport_remove_sock(struct vsock_sock *vsk) { struct virtio_vsock_sock *vvs = vsk->trans; /* We don't need to take rx_lock, as the socket is closing and we are * removing it. */ __skb_queue_purge(&vvs->rx_queue); vsock_remove_sock(vsk); } static void virtio_transport_wait_close(struct sock *sk, long timeout) { if (timeout) { DEFINE_WAIT_FUNC(wait, woken_wake_function); add_wait_queue(sk_sleep(sk), &wait); do { if (sk_wait_event(sk, &timeout, sock_flag(sk, SOCK_DONE), &wait)) break; } while (!signal_pending(current) && timeout); remove_wait_queue(sk_sleep(sk), &wait); } } static void virtio_transport_do_close(struct vsock_sock *vsk, bool cancel_timeout) { struct sock *sk = sk_vsock(vsk); sock_set_flag(sk, SOCK_DONE); vsk->peer_shutdown = SHUTDOWN_MASK; if (vsock_stream_has_data(vsk) <= 0) sk->sk_state = TCP_CLOSING; sk->sk_state_change(sk); if (vsk->close_work_scheduled && (!cancel_timeout || cancel_delayed_work(&vsk->close_work))) { vsk->close_work_scheduled = false; virtio_transport_remove_sock(vsk); /* Release refcnt obtained when we scheduled the timeout */ sock_put(sk); } } static void virtio_transport_close_timeout(struct work_struct *work) { struct vsock_sock *vsk = container_of(work, struct vsock_sock, close_work.work); struct sock *sk = sk_vsock(vsk); sock_hold(sk); lock_sock(sk); if (!sock_flag(sk, SOCK_DONE)) { (void)virtio_transport_reset(vsk, NULL); virtio_transport_do_close(vsk, false); } vsk->close_work_scheduled = false; release_sock(sk); sock_put(sk); } /* User context, vsk->sk is locked */ static bool virtio_transport_close(struct vsock_sock *vsk) { struct sock *sk = &vsk->sk; if (!(sk->sk_state == TCP_ESTABLISHED || sk->sk_state == TCP_CLOSING)) return true; /* Already received SHUTDOWN from peer, reply with RST */ if ((vsk->peer_shutdown & SHUTDOWN_MASK) == SHUTDOWN_MASK) { (void)virtio_transport_reset(vsk, NULL); return true; } if ((sk->sk_shutdown & SHUTDOWN_MASK) != SHUTDOWN_MASK) (void)virtio_transport_shutdown(vsk, SHUTDOWN_MASK); if (sock_flag(sk, SOCK_LINGER) && !(current->flags & PF_EXITING)) virtio_transport_wait_close(sk, sk->sk_lingertime); if (sock_flag(sk, SOCK_DONE)) { return true; } sock_hold(sk); INIT_DELAYED_WORK(&vsk->close_work, virtio_transport_close_timeout); vsk->close_work_scheduled = true; schedule_delayed_work(&vsk->close_work, VSOCK_CLOSE_TIMEOUT); return false; } void virtio_transport_release(struct vsock_sock *vsk) { struct sock *sk = &vsk->sk; bool remove_sock = true; if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) remove_sock = virtio_transport_close(vsk); if (remove_sock) { sock_set_flag(sk, SOCK_DONE); virtio_transport_remove_sock(vsk); } } EXPORT_SYMBOL_GPL(virtio_transport_release); static int virtio_transport_recv_connecting(struct sock *sk, struct sk_buff *skb) { struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb); struct vsock_sock *vsk = vsock_sk(sk); int skerr; int err; switch (le16_to_cpu(hdr->op)) { case VIRTIO_VSOCK_OP_RESPONSE: sk->sk_state = TCP_ESTABLISHED; sk->sk_socket->state = SS_CONNECTED; vsock_insert_connected(vsk); sk->sk_state_change(sk); break; case VIRTIO_VSOCK_OP_INVALID: break; case VIRTIO_VSOCK_OP_RST: skerr = ECONNRESET; err = 0; goto destroy; default: skerr = EPROTO; err = -EINVAL; goto destroy; } return 0; destroy: virtio_transport_reset(vsk, skb); sk->sk_state = TCP_CLOSE; sk->sk_err = skerr; sk_error_report(sk); return err; } static void virtio_transport_recv_enqueue(struct vsock_sock *vsk, struct sk_buff *skb) { struct virtio_vsock_sock *vvs = vsk->trans; bool can_enqueue, free_pkt = false; struct virtio_vsock_hdr *hdr; u32 len; hdr = virtio_vsock_hdr(skb); len = le32_to_cpu(hdr->len); spin_lock_bh(&vvs->rx_lock); can_enqueue = virtio_transport_inc_rx_pkt(vvs, len); if (!can_enqueue) { free_pkt = true; goto out; } if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOM) vvs->msg_count++; /* Try to copy small packets into the buffer of last packet queued, * to avoid wasting memory queueing the entire buffer with a small * payload. */ if (len <= GOOD_COPY_LEN && !skb_queue_empty(&vvs->rx_queue)) { struct virtio_vsock_hdr *last_hdr; struct sk_buff *last_skb; last_skb = skb_peek_tail(&vvs->rx_queue); last_hdr = virtio_vsock_hdr(last_skb); /* If there is space in the last packet queued, we copy the * new packet in its buffer. We avoid this if the last packet * queued has VIRTIO_VSOCK_SEQ_EOM set, because this is * delimiter of SEQPACKET message, so 'pkt' is the first packet * of a new message. */ if (skb->len < skb_tailroom(last_skb) && !(le32_to_cpu(last_hdr->flags) & VIRTIO_VSOCK_SEQ_EOM)) { memcpy(skb_put(last_skb, skb->len), skb->data, skb->len); free_pkt = true; last_hdr->flags |= hdr->flags; le32_add_cpu(&last_hdr->len, len); goto out; } } __skb_queue_tail(&vvs->rx_queue, skb); out: spin_unlock_bh(&vvs->rx_lock); if (free_pkt) kfree_skb(skb); } static int virtio_transport_recv_connected(struct sock *sk, struct sk_buff *skb) { struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb); struct vsock_sock *vsk = vsock_sk(sk); int err = 0; switch (le16_to_cpu(hdr->op)) { case VIRTIO_VSOCK_OP_RW: virtio_transport_recv_enqueue(vsk, skb); vsock_data_ready(sk); return err; case VIRTIO_VSOCK_OP_CREDIT_REQUEST: virtio_transport_send_credit_update(vsk); break; case VIRTIO_VSOCK_OP_CREDIT_UPDATE: sk->sk_write_space(sk); break; case VIRTIO_VSOCK_OP_SHUTDOWN: if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SHUTDOWN_RCV) vsk->peer_shutdown |= RCV_SHUTDOWN; if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SHUTDOWN_SEND) vsk->peer_shutdown |= SEND_SHUTDOWN; if (vsk->peer_shutdown == SHUTDOWN_MASK) { if (vsock_stream_has_data(vsk) <= 0 && !sock_flag(sk, SOCK_DONE)) { (void)virtio_transport_reset(vsk, NULL); virtio_transport_do_close(vsk, true); } /* Remove this socket anyway because the remote peer sent * the shutdown. This way a new connection will succeed * if the remote peer uses the same source port, * even if the old socket is still unreleased, but now disconnected. */ vsock_remove_sock(vsk); } if (le32_to_cpu(virtio_vsock_hdr(skb)->flags)) sk->sk_state_change(sk); break; case VIRTIO_VSOCK_OP_RST: virtio_transport_do_close(vsk, true); break; default: err = -EINVAL; break; } kfree_skb(skb); return err; } static void virtio_transport_recv_disconnecting(struct sock *sk, struct sk_buff *skb) { struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb); struct vsock_sock *vsk = vsock_sk(sk); if (le16_to_cpu(hdr->op) == VIRTIO_VSOCK_OP_RST) virtio_transport_do_close(vsk, true); } static int virtio_transport_send_response(struct vsock_sock *vsk, struct sk_buff *skb) { struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb); struct virtio_vsock_pkt_info info = { .op = VIRTIO_VSOCK_OP_RESPONSE, .remote_cid = le64_to_cpu(hdr->src_cid), .remote_port = le32_to_cpu(hdr->src_port), .reply = true, .vsk = vsk, }; return virtio_transport_send_pkt_info(vsk, &info); } static bool virtio_transport_space_update(struct sock *sk, struct sk_buff *skb) { struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb); struct vsock_sock *vsk = vsock_sk(sk); struct virtio_vsock_sock *vvs = vsk->trans; bool space_available; /* Listener sockets are not associated with any transport, so we are * not able to take the state to see if there is space available in the * remote peer, but since they are only used to receive requests, we * can assume that there is always space available in the other peer. */ if (!vvs) return true; /* buf_alloc and fwd_cnt is always included in the hdr */ spin_lock_bh(&vvs->tx_lock); vvs->peer_buf_alloc = le32_to_cpu(hdr->buf_alloc); vvs->peer_fwd_cnt = le32_to_cpu(hdr->fwd_cnt); space_available = virtio_transport_has_space(vsk); spin_unlock_bh(&vvs->tx_lock); return space_available; } /* Handle server socket */ static int virtio_transport_recv_listen(struct sock *sk, struct sk_buff *skb, struct virtio_transport *t) { struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb); struct vsock_sock *vsk = vsock_sk(sk); struct vsock_sock *vchild; struct sock *child; int ret; if (le16_to_cpu(hdr->op) != VIRTIO_VSOCK_OP_REQUEST) { virtio_transport_reset_no_sock(t, skb); return -EINVAL; } if (sk_acceptq_is_full(sk)) { virtio_transport_reset_no_sock(t, skb); return -ENOMEM; } child = vsock_create_connected(sk); if (!child) { virtio_transport_reset_no_sock(t, skb); return -ENOMEM; } sk_acceptq_added(sk); lock_sock_nested(child, SINGLE_DEPTH_NESTING); child->sk_state = TCP_ESTABLISHED; vchild = vsock_sk(child); vsock_addr_init(&vchild->local_addr, le64_to_cpu(hdr->dst_cid), le32_to_cpu(hdr->dst_port)); vsock_addr_init(&vchild->remote_addr, le64_to_cpu(hdr->src_cid), le32_to_cpu(hdr->src_port)); ret = vsock_assign_transport(vchild, vsk); /* Transport assigned (looking at remote_addr) must be the same * where we received the request. */ if (ret || vchild->transport != &t->transport) { release_sock(child); virtio_transport_reset_no_sock(t, skb); sock_put(child); return ret; } if (virtio_transport_space_update(child, skb)) child->sk_write_space(child); vsock_insert_connected(vchild); vsock_enqueue_accept(sk, child); virtio_transport_send_response(vchild, skb); release_sock(child); sk->sk_data_ready(sk); return 0; } static bool virtio_transport_valid_type(u16 type) { return (type == VIRTIO_VSOCK_TYPE_STREAM) || (type == VIRTIO_VSOCK_TYPE_SEQPACKET); } /* We are under the virtio-vsock's vsock->rx_lock or vhost-vsock's vq->mutex * lock. */ void virtio_transport_recv_pkt(struct virtio_transport *t, struct sk_buff *skb) { struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb); struct sockaddr_vm src, dst; struct vsock_sock *vsk; struct sock *sk; bool space_available; vsock_addr_init(&src, le64_to_cpu(hdr->src_cid), le32_to_cpu(hdr->src_port)); vsock_addr_init(&dst, le64_to_cpu(hdr->dst_cid), le32_to_cpu(hdr->dst_port)); trace_virtio_transport_recv_pkt(src.svm_cid, src.svm_port, dst.svm_cid, dst.svm_port, le32_to_cpu(hdr->len), le16_to_cpu(hdr->type), le16_to_cpu(hdr->op), le32_to_cpu(hdr->flags), le32_to_cpu(hdr->buf_alloc), le32_to_cpu(hdr->fwd_cnt)); if (!virtio_transport_valid_type(le16_to_cpu(hdr->type))) { (void)virtio_transport_reset_no_sock(t, skb); goto free_pkt; } /* The socket must be in connected or bound table * otherwise send reset back */ sk = vsock_find_connected_socket(&src, &dst); if (!sk) { sk = vsock_find_bound_socket(&dst); if (!sk) { (void)virtio_transport_reset_no_sock(t, skb); goto free_pkt; } } if (virtio_transport_get_type(sk) != le16_to_cpu(hdr->type)) { (void)virtio_transport_reset_no_sock(t, skb); sock_put(sk); goto free_pkt; } if (!skb_set_owner_sk_safe(skb, sk)) { WARN_ONCE(1, "receiving vsock socket has sk_refcnt == 0\n"); goto free_pkt; } vsk = vsock_sk(sk); lock_sock(sk); /* Check if sk has been closed before lock_sock */ if (sock_flag(sk, SOCK_DONE)) { (void)virtio_transport_reset_no_sock(t, skb); release_sock(sk); sock_put(sk); goto free_pkt; } space_available = virtio_transport_space_update(sk, skb); /* Update CID in case it has changed after a transport reset event */ if (vsk->local_addr.svm_cid != VMADDR_CID_ANY) vsk->local_addr.svm_cid = dst.svm_cid; if (space_available) sk->sk_write_space(sk); switch (sk->sk_state) { case TCP_LISTEN: virtio_transport_recv_listen(sk, skb, t); kfree_skb(skb); break; case TCP_SYN_SENT: virtio_transport_recv_connecting(sk, skb); kfree_skb(skb); break; case TCP_ESTABLISHED: virtio_transport_recv_connected(sk, skb); break; case TCP_CLOSING: virtio_transport_recv_disconnecting(sk, skb); kfree_skb(skb); break; default: (void)virtio_transport_reset_no_sock(t, skb); kfree_skb(skb); break; } release_sock(sk); /* Release refcnt obtained when we fetched this socket out of the * bound or connected list. */ sock_put(sk); return; free_pkt: kfree_skb(skb); } EXPORT_SYMBOL_GPL(virtio_transport_recv_pkt); /* Remove skbs found in a queue that have a vsk that matches. * * Each skb is freed. * * Returns the count of skbs that were reply packets. */ int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *queue) { struct sk_buff_head freeme; struct sk_buff *skb, *tmp; int cnt = 0; skb_queue_head_init(&freeme); spin_lock_bh(&queue->lock); skb_queue_walk_safe(queue, skb, tmp) { if (vsock_sk(skb->sk) != vsk) continue; __skb_unlink(skb, queue); __skb_queue_tail(&freeme, skb); if (virtio_vsock_skb_reply(skb)) cnt++; } spin_unlock_bh(&queue->lock); __skb_queue_purge(&freeme); return cnt; } EXPORT_SYMBOL_GPL(virtio_transport_purge_skbs); int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t recv_actor) { struct virtio_vsock_sock *vvs = vsk->trans; struct sock *sk = sk_vsock(vsk); struct sk_buff *skb; int off = 0; int err; spin_lock_bh(&vvs->rx_lock); /* Use __skb_recv_datagram() for race-free handling of the receive. It * works for types other than dgrams. */ skb = __skb_recv_datagram(sk, &vvs->rx_queue, MSG_DONTWAIT, &off, &err); spin_unlock_bh(&vvs->rx_lock); if (!skb) return err; return recv_actor(sk, skb); } EXPORT_SYMBOL_GPL(virtio_transport_read_skb); int virtio_transport_notify_set_rcvlowat(struct vsock_sock *vsk, int val) { struct virtio_vsock_sock *vvs = vsk->trans; bool send_update; spin_lock_bh(&vvs->rx_lock); /* If number of available bytes is less than new SO_RCVLOWAT value, * kick sender to send more data, because sender may sleep in its * 'send()' syscall waiting for enough space at our side. Also * don't send credit update when peer already knows actual value - * such transmission will be useless. */ send_update = (vvs->rx_bytes < val) && (vvs->fwd_cnt != vvs->last_fwd_cnt); spin_unlock_bh(&vvs->rx_lock); if (send_update) { int err; err = virtio_transport_send_credit_update(vsk); if (err < 0) return err; } return 0; } EXPORT_SYMBOL_GPL(virtio_transport_notify_set_rcvlowat); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Asias He"); MODULE_DESCRIPTION("common code for virtio vsock");
516 515 464 464 464 464 1 12 464 311 312 7 7 7 459 459 314 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 #include <linux/atomic.h> #include <linux/export.h> #include <linux/generic-radix-tree.h> #include <linux/gfp.h> #include <linux/kmemleak.h> #define GENRADIX_ARY (PAGE_SIZE / sizeof(struct genradix_node *)) #define GENRADIX_ARY_SHIFT ilog2(GENRADIX_ARY) struct genradix_node { union { /* Interior node: */ struct genradix_node *children[GENRADIX_ARY]; /* Leaf: */ u8 data[PAGE_SIZE]; }; }; static inline int genradix_depth_shift(unsigned depth) { return PAGE_SHIFT + GENRADIX_ARY_SHIFT * depth; } /* * Returns size (of data, in bytes) that a tree of a given depth holds: */ static inline size_t genradix_depth_size(unsigned depth) { return 1UL << genradix_depth_shift(depth); } /* depth that's needed for a genradix that can address up to ULONG_MAX: */ #define GENRADIX_MAX_DEPTH \ DIV_ROUND_UP(BITS_PER_LONG - PAGE_SHIFT, GENRADIX_ARY_SHIFT) #define GENRADIX_DEPTH_MASK \ ((unsigned long) (roundup_pow_of_two(GENRADIX_MAX_DEPTH + 1) - 1)) static inline unsigned genradix_root_to_depth(struct genradix_root *r) { return (unsigned long) r & GENRADIX_DEPTH_MASK; } static inline struct genradix_node *genradix_root_to_node(struct genradix_root *r) { return (void *) ((unsigned long) r & ~GENRADIX_DEPTH_MASK); } /* * Returns pointer to the specified byte @offset within @radix, or NULL if not * allocated */ void *__genradix_ptr(struct __genradix *radix, size_t offset) { struct genradix_root *r = READ_ONCE(radix->root); struct genradix_node *n = genradix_root_to_node(r); unsigned level = genradix_root_to_depth(r); if (ilog2(offset) >= genradix_depth_shift(level)) return NULL; while (1) { if (!n) return NULL; if (!level) break; level--; n = n->children[offset >> genradix_depth_shift(level)]; offset &= genradix_depth_size(level) - 1; } return &n->data[offset]; } EXPORT_SYMBOL(__genradix_ptr); static inline struct genradix_node *genradix_alloc_node(gfp_t gfp_mask) { struct genradix_node *node; node = (struct genradix_node *)__get_free_page(gfp_mask|__GFP_ZERO); /* * We're using pages (not slab allocations) directly for kernel data * structures, so we need to explicitly inform kmemleak of them in order * to avoid false positive memory leak reports. */ kmemleak_alloc(node, PAGE_SIZE, 1, gfp_mask); return node; } static inline void genradix_free_node(struct genradix_node *node) { kmemleak_free(node); free_page((unsigned long)node); } /* * Returns pointer to the specified byte @offset within @radix, allocating it if * necessary - newly allocated slots are always zeroed out: */ void *__genradix_ptr_alloc(struct __genradix *radix, size_t offset, gfp_t gfp_mask) { struct genradix_root *v = READ_ONCE(radix->root); struct genradix_node *n, *new_node = NULL; unsigned level; /* Increase tree depth if necessary: */ while (1) { struct genradix_root *r = v, *new_root; n = genradix_root_to_node(r); level = genradix_root_to_depth(r); if (n && ilog2(offset) < genradix_depth_shift(level)) break; if (!new_node) { new_node = genradix_alloc_node(gfp_mask); if (!new_node) return NULL; } new_node->children[0] = n; new_root = ((struct genradix_root *) ((unsigned long) new_node | (n ? level + 1 : 0))); if ((v = cmpxchg_release(&radix->root, r, new_root)) == r) { v = new_root; new_node = NULL; } } while (level--) { struct genradix_node **p = &n->children[offset >> genradix_depth_shift(level)]; offset &= genradix_depth_size(level) - 1; n = READ_ONCE(*p); if (!n) { if (!new_node) { new_node = genradix_alloc_node(gfp_mask); if (!new_node) return NULL; } if (!(n = cmpxchg_release(p, NULL, new_node))) swap(n, new_node); } } if (new_node) genradix_free_node(new_node); return &n->data[offset]; } EXPORT_SYMBOL(__genradix_ptr_alloc); void *__genradix_iter_peek(struct genradix_iter *iter, struct __genradix *radix, size_t objs_per_page) { struct genradix_root *r; struct genradix_node *n; unsigned level, i; if (iter->offset == SIZE_MAX) return NULL; restart: r = READ_ONCE(radix->root); if (!r) return NULL; n = genradix_root_to_node(r); level = genradix_root_to_depth(r); if (ilog2(iter->offset) >= genradix_depth_shift(level)) return NULL; while (level) { level--; i = (iter->offset >> genradix_depth_shift(level)) & (GENRADIX_ARY - 1); while (!n->children[i]) { size_t objs_per_ptr = genradix_depth_size(level); if (iter->offset + objs_per_ptr < iter->offset) { iter->offset = SIZE_MAX; iter->pos = SIZE_MAX; return NULL; } i++; iter->offset = round_down(iter->offset + objs_per_ptr, objs_per_ptr); iter->pos = (iter->offset >> PAGE_SHIFT) * objs_per_page; if (i == GENRADIX_ARY) goto restart; } n = n->children[i]; } return &n->data[iter->offset & (PAGE_SIZE - 1)]; } EXPORT_SYMBOL(__genradix_iter_peek); void *__genradix_iter_peek_prev(struct genradix_iter *iter, struct __genradix *radix, size_t objs_per_page, size_t obj_size_plus_page_remainder) { struct genradix_root *r; struct genradix_node *n; unsigned level, i; if (iter->offset == SIZE_MAX) return NULL; restart: r = READ_ONCE(radix->root); if (!r) return NULL; n = genradix_root_to_node(r); level = genradix_root_to_depth(r); if (ilog2(iter->offset) >= genradix_depth_shift(level)) { iter->offset = genradix_depth_size(level); iter->pos = (iter->offset >> PAGE_SHIFT) * objs_per_page; iter->offset -= obj_size_plus_page_remainder; iter->pos--; } while (level) { level--; i = (iter->offset >> genradix_depth_shift(level)) & (GENRADIX_ARY - 1); while (!n->children[i]) { size_t objs_per_ptr = genradix_depth_size(level); iter->offset = round_down(iter->offset, objs_per_ptr); iter->pos = (iter->offset >> PAGE_SHIFT) * objs_per_page; if (!iter->offset) return NULL; iter->offset -= obj_size_plus_page_remainder; iter->pos--; if (!i) goto restart; --i; } n = n->children[i]; } return &n->data[iter->offset & (PAGE_SIZE - 1)]; } EXPORT_SYMBOL(__genradix_iter_peek_prev); static void genradix_free_recurse(struct genradix_node *n, unsigned level) { if (level) { unsigned i; for (i = 0; i < GENRADIX_ARY; i++) if (n->children[i]) genradix_free_recurse(n->children[i], level - 1); } genradix_free_node(n); } int __genradix_prealloc(struct __genradix *radix, size_t size, gfp_t gfp_mask) { size_t offset; for (offset = 0; offset < size; offset += PAGE_SIZE) if (!__genradix_ptr_alloc(radix, offset, gfp_mask)) return -ENOMEM; return 0; } EXPORT_SYMBOL(__genradix_prealloc); void __genradix_free(struct __genradix *radix) { struct genradix_root *r = xchg(&radix->root, NULL); genradix_free_recurse(genradix_root_to_node(r), genradix_root_to_depth(r)); } EXPORT_SYMBOL(__genradix_free);
1187 1190 187 187 3080 3084 10844 10859 16 16 190 190 14350 14368 6201 493 6151 79 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 // SPDX-License-Identifier: GPL-2.0-or-later /* bit search implementation * * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * Copyright (C) 2008 IBM Corporation * 'find_last_bit' is written by Rusty Russell <rusty@rustcorp.com.au> * (Inspired by David Howell's find_next_bit implementation) * * Rewritten by Yury Norov <yury.norov@gmail.com> to decrease * size and improve performance, 2015. */ #include <linux/bitops.h> #include <linux/bitmap.h> #include <linux/export.h> #include <linux/math.h> #include <linux/minmax.h> #include <linux/swab.h> /* * Common helper for find_bit() function family * @FETCH: The expression that fetches and pre-processes each word of bitmap(s) * @MUNGE: The expression that post-processes a word containing found bit (may be empty) * @size: The bitmap size in bits */ #define FIND_FIRST_BIT(FETCH, MUNGE, size) \ ({ \ unsigned long idx, val, sz = (size); \ \ for (idx = 0; idx * BITS_PER_LONG < sz; idx++) { \ val = (FETCH); \ if (val) { \ sz = min(idx * BITS_PER_LONG + __ffs(MUNGE(val)), sz); \ break; \ } \ } \ \ sz; \ }) /* * Common helper for find_next_bit() function family * @FETCH: The expression that fetches and pre-processes each word of bitmap(s) * @MUNGE: The expression that post-processes a word containing found bit (may be empty) * @size: The bitmap size in bits * @start: The bitnumber to start searching at */ #define FIND_NEXT_BIT(FETCH, MUNGE, size, start) \ ({ \ unsigned long mask, idx, tmp, sz = (size), __start = (start); \ \ if (unlikely(__start >= sz)) \ goto out; \ \ mask = MUNGE(BITMAP_FIRST_WORD_MASK(__start)); \ idx = __start / BITS_PER_LONG; \ \ for (tmp = (FETCH) & mask; !tmp; tmp = (FETCH)) { \ if ((idx + 1) * BITS_PER_LONG >= sz) \ goto out; \ idx++; \ } \ \ sz = min(idx * BITS_PER_LONG + __ffs(MUNGE(tmp)), sz); \ out: \ sz; \ }) #define FIND_NTH_BIT(FETCH, size, num) \ ({ \ unsigned long sz = (size), nr = (num), idx, w, tmp; \ \ for (idx = 0; (idx + 1) * BITS_PER_LONG <= sz; idx++) { \ if (idx * BITS_PER_LONG + nr >= sz) \ goto out; \ \ tmp = (FETCH); \ w = hweight_long(tmp); \ if (w > nr) \ goto found; \ \ nr -= w; \ } \ \ if (sz % BITS_PER_LONG) \ tmp = (FETCH) & BITMAP_LAST_WORD_MASK(sz); \ found: \ sz = min(idx * BITS_PER_LONG + fns(tmp, nr), sz); \ out: \ sz; \ }) #ifndef find_first_bit /* * Find the first set bit in a memory region. */ unsigned long _find_first_bit(const unsigned long *addr, unsigned long size) { return FIND_FIRST_BIT(addr[idx], /* nop */, size); } EXPORT_SYMBOL(_find_first_bit); #endif #ifndef find_first_and_bit /* * Find the first set bit in two memory regions. */ unsigned long _find_first_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size) { return FIND_FIRST_BIT(addr1[idx] & addr2[idx], /* nop */, size); } EXPORT_SYMBOL(_find_first_and_bit); #endif #ifndef find_first_zero_bit /* * Find the first cleared bit in a memory region. */ unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size) { return FIND_FIRST_BIT(~addr[idx], /* nop */, size); } EXPORT_SYMBOL(_find_first_zero_bit); #endif #ifndef find_next_bit unsigned long _find_next_bit(const unsigned long *addr, unsigned long nbits, unsigned long start) { return FIND_NEXT_BIT(addr[idx], /* nop */, nbits, start); } EXPORT_SYMBOL(_find_next_bit); #endif unsigned long __find_nth_bit(const unsigned long *addr, unsigned long size, unsigned long n) { return FIND_NTH_BIT(addr[idx], size, n); } EXPORT_SYMBOL(__find_nth_bit); unsigned long __find_nth_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long n) { return FIND_NTH_BIT(addr1[idx] & addr2[idx], size, n); } EXPORT_SYMBOL(__find_nth_and_bit); unsigned long __find_nth_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long n) { return FIND_NTH_BIT(addr1[idx] & ~addr2[idx], size, n); } EXPORT_SYMBOL(__find_nth_andnot_bit); unsigned long __find_nth_and_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, const unsigned long *addr3, unsigned long size, unsigned long n) { return FIND_NTH_BIT(addr1[idx] & addr2[idx] & ~addr3[idx], size, n); } EXPORT_SYMBOL(__find_nth_and_andnot_bit); #ifndef find_next_and_bit unsigned long _find_next_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long nbits, unsigned long start) { return FIND_NEXT_BIT(addr1[idx] & addr2[idx], /* nop */, nbits, start); } EXPORT_SYMBOL(_find_next_and_bit); #endif #ifndef find_next_andnot_bit unsigned long _find_next_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long nbits, unsigned long start) { return FIND_NEXT_BIT(addr1[idx] & ~addr2[idx], /* nop */, nbits, start); } EXPORT_SYMBOL(_find_next_andnot_bit); #endif #ifndef find_next_or_bit unsigned long _find_next_or_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long nbits, unsigned long start) { return FIND_NEXT_BIT(addr1[idx] | addr2[idx], /* nop */, nbits, start); } EXPORT_SYMBOL(_find_next_or_bit); #endif #ifndef find_next_zero_bit unsigned long _find_next_zero_bit(const unsigned long *addr, unsigned long nbits, unsigned long start) { return FIND_NEXT_BIT(~addr[idx], /* nop */, nbits, start); } EXPORT_SYMBOL(_find_next_zero_bit); #endif #ifndef find_last_bit unsigned long _find_last_bit(const unsigned long *addr, unsigned long size) { if (size) { unsigned long val = BITMAP_LAST_WORD_MASK(size); unsigned long idx = (size-1) / BITS_PER_LONG; do { val &= addr[idx]; if (val) return idx * BITS_PER_LONG + __fls(val); val = ~0ul; } while (idx--); } return size; } EXPORT_SYMBOL(_find_last_bit); #endif unsigned long find_next_clump8(unsigned long *clump, const unsigned long *addr, unsigned long size, unsigned long offset) { offset = find_next_bit(addr, size, offset); if (offset == size) return size; offset = round_down(offset, 8); *clump = bitmap_get_value8(addr, offset); return offset; } EXPORT_SYMBOL(find_next_clump8); #ifdef __BIG_ENDIAN #ifndef find_first_zero_bit_le /* * Find the first cleared bit in an LE memory region. */ unsigned long _find_first_zero_bit_le(const unsigned long *addr, unsigned long size) { return FIND_FIRST_BIT(~addr[idx], swab, size); } EXPORT_SYMBOL(_find_first_zero_bit_le); #endif #ifndef find_next_zero_bit_le unsigned long _find_next_zero_bit_le(const unsigned long *addr, unsigned long size, unsigned long offset) { return FIND_NEXT_BIT(~addr[idx], swab, size, offset); } EXPORT_SYMBOL(_find_next_zero_bit_le); #endif #ifndef find_next_bit_le unsigned long _find_next_bit_le(const unsigned long *addr, unsigned long size, unsigned long offset) { return FIND_NEXT_BIT(addr[idx], swab, size, offset); } EXPORT_SYMBOL(_find_next_bit_le); #endif #endif /* __BIG_ENDIAN */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __IEEE802154_CORE_H #define __IEEE802154_CORE_H #include <net/cfg802154.h> struct cfg802154_registered_device { const struct cfg802154_ops *ops; struct list_head list; /* wpan_phy index, internal only */ int wpan_phy_idx; /* also protected by devlist_mtx */ int opencount; wait_queue_head_t dev_wait; /* protected by RTNL only */ int num_running_ifaces; /* associated wpan interfaces, protected by rtnl or RCU */ struct list_head wpan_dev_list; int devlist_generation, wpan_dev_id; /* must be last because of the way we do wpan_phy_priv(), * and it should at least be aligned to NETDEV_ALIGN */ struct wpan_phy wpan_phy __aligned(NETDEV_ALIGN); }; static inline struct cfg802154_registered_device * wpan_phy_to_rdev(struct wpan_phy *wpan_phy) { BUG_ON(!wpan_phy); return container_of(wpan_phy, struct cfg802154_registered_device, wpan_phy); } extern struct list_head cfg802154_rdev_list; extern int cfg802154_rdev_list_generation; int cfg802154_switch_netns(struct cfg802154_registered_device *rdev, struct net *net); /* free object */ void cfg802154_dev_free(struct cfg802154_registered_device *rdev); struct cfg802154_registered_device * cfg802154_rdev_by_wpan_phy_idx(int wpan_phy_idx); struct wpan_phy *wpan_phy_idx_to_wpan_phy(int wpan_phy_idx); #endif /* __IEEE802154_CORE_H */
5000 5002 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_PGALLOC_TRACK_H #define _LINUX_PGALLOC_TRACK_H #if defined(CONFIG_MMU) static inline p4d_t *p4d_alloc_track(struct mm_struct *mm, pgd_t *pgd, unsigned long address, pgtbl_mod_mask *mod_mask) { if (unlikely(pgd_none(*pgd))) { if (__p4d_alloc(mm, pgd, address)) return NULL; *mod_mask |= PGTBL_PGD_MODIFIED; } return p4d_offset(pgd, address); } static inline pud_t *pud_alloc_track(struct mm_struct *mm, p4d_t *p4d, unsigned long address, pgtbl_mod_mask *mod_mask) { if (unlikely(p4d_none(*p4d))) { if (__pud_alloc(mm, p4d, address)) return NULL; *mod_mask |= PGTBL_P4D_MODIFIED; } return pud_offset(p4d, address); } static inline pmd_t *pmd_alloc_track(struct mm_struct *mm, pud_t *pud, unsigned long address, pgtbl_mod_mask *mod_mask) { if (unlikely(pud_none(*pud))) { if (__pmd_alloc(mm, pud, address)) return NULL; *mod_mask |= PGTBL_PUD_MODIFIED; } return pmd_offset(pud, address); } #endif /* CONFIG_MMU */ #define pte_alloc_kernel_track(pmd, address, mask) \ ((unlikely(pmd_none(*(pmd))) && \ (__pte_alloc_kernel(pmd) || ({*(mask)|=PGTBL_PMD_MODIFIED;0;})))?\ NULL: pte_offset_kernel(pmd, address)) #endif /* _LINUX_PGALLOC_TRACK_H */
34 34 2 87 84 16 1013 1 1010 1 3 2 114 887 888 276 859 1 867 4 1 1 865 64 9 33 554 1 1 4 6 849 626 161 2 722 1 1 867 3 1877 3 4 1 1013 6 4 6 12 5 11 8 6 4 2 2 407 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 /* Netfilter messages via netlink socket. Allows for user space * protocol helpers and general trouble making from userspace. * * (C) 2001 by Jay Schulist <jschlst@samba.org>, * (C) 2002-2005 by Harald Welte <laforge@gnumonks.org> * (C) 2005-2017 by Pablo Neira Ayuso <pablo@netfilter.org> * * Initial netfilter messages via netlink development funded and * generally made possible by Network Robots, Inc. (www.networkrobots.com) * * Further development of this code funded by Astaro AG (http://www.astaro.com) * * This software may be used and distributed according to the terms * of the GNU General Public License, incorporated herein by reference. */ #include <linux/module.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/skbuff.h> #include <linux/uaccess.h> #include <net/sock.h> #include <linux/init.h> #include <linux/sched/signal.h> #include <net/netlink.h> #include <net/netns/generic.h> #include <linux/netfilter.h> #include <linux/netfilter/nfnetlink.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NETFILTER); MODULE_DESCRIPTION("Netfilter messages via netlink socket"); #define nfnl_dereference_protected(id) \ rcu_dereference_protected(table[(id)].subsys, \ lockdep_nfnl_is_held((id))) #define NFNL_MAX_ATTR_COUNT 32 static unsigned int nfnetlink_pernet_id __read_mostly; #ifdef CONFIG_NF_CONNTRACK_EVENTS static DEFINE_SPINLOCK(nfnl_grp_active_lock); #endif struct nfnl_net { struct sock *nfnl; }; static struct { struct mutex mutex; const struct nfnetlink_subsystem __rcu *subsys; } table[NFNL_SUBSYS_COUNT]; static struct lock_class_key nfnl_lockdep_keys[NFNL_SUBSYS_COUNT]; static const char *const nfnl_lockdep_names[NFNL_SUBSYS_COUNT] = { [NFNL_SUBSYS_NONE] = "nfnl_subsys_none", [NFNL_SUBSYS_CTNETLINK] = "nfnl_subsys_ctnetlink", [NFNL_SUBSYS_CTNETLINK_EXP] = "nfnl_subsys_ctnetlink_exp", [NFNL_SUBSYS_QUEUE] = "nfnl_subsys_queue", [NFNL_SUBSYS_ULOG] = "nfnl_subsys_ulog", [NFNL_SUBSYS_OSF] = "nfnl_subsys_osf", [NFNL_SUBSYS_IPSET] = "nfnl_subsys_ipset", [NFNL_SUBSYS_ACCT] = "nfnl_subsys_acct", [NFNL_SUBSYS_CTNETLINK_TIMEOUT] = "nfnl_subsys_cttimeout", [NFNL_SUBSYS_CTHELPER] = "nfnl_subsys_cthelper", [NFNL_SUBSYS_NFTABLES] = "nfnl_subsys_nftables", [NFNL_SUBSYS_NFT_COMPAT] = "nfnl_subsys_nftcompat", [NFNL_SUBSYS_HOOK] = "nfnl_subsys_hook", }; static const int nfnl_group2type[NFNLGRP_MAX+1] = { [NFNLGRP_CONNTRACK_NEW] = NFNL_SUBSYS_CTNETLINK, [NFNLGRP_CONNTRACK_UPDATE] = NFNL_SUBSYS_CTNETLINK, [NFNLGRP_CONNTRACK_DESTROY] = NFNL_SUBSYS_CTNETLINK, [NFNLGRP_CONNTRACK_EXP_NEW] = NFNL_SUBSYS_CTNETLINK_EXP, [NFNLGRP_CONNTRACK_EXP_UPDATE] = NFNL_SUBSYS_CTNETLINK_EXP, [NFNLGRP_CONNTRACK_EXP_DESTROY] = NFNL_SUBSYS_CTNETLINK_EXP, [NFNLGRP_NFTABLES] = NFNL_SUBSYS_NFTABLES, [NFNLGRP_ACCT_QUOTA] = NFNL_SUBSYS_ACCT, [NFNLGRP_NFTRACE] = NFNL_SUBSYS_NFTABLES, }; static struct nfnl_net *nfnl_pernet(struct net *net) { return net_generic(net, nfnetlink_pernet_id); } void nfnl_lock(__u8 subsys_id) { mutex_lock(&table[subsys_id].mutex); } EXPORT_SYMBOL_GPL(nfnl_lock); void nfnl_unlock(__u8 subsys_id) { mutex_unlock(&table[subsys_id].mutex); } EXPORT_SYMBOL_GPL(nfnl_unlock); #ifdef CONFIG_PROVE_LOCKING bool lockdep_nfnl_is_held(u8 subsys_id) { return lockdep_is_held(&table[subsys_id].mutex); } EXPORT_SYMBOL_GPL(lockdep_nfnl_is_held); #endif int nfnetlink_subsys_register(const struct nfnetlink_subsystem *n) { u8 cb_id; /* Sanity-check attr_count size to avoid stack buffer overflow. */ for (cb_id = 0; cb_id < n->cb_count; cb_id++) if (WARN_ON(n->cb[cb_id].attr_count > NFNL_MAX_ATTR_COUNT)) return -EINVAL; nfnl_lock(n->subsys_id); if (table[n->subsys_id].subsys) { nfnl_unlock(n->subsys_id); return -EBUSY; } rcu_assign_pointer(table[n->subsys_id].subsys, n); nfnl_unlock(n->subsys_id); return 0; } EXPORT_SYMBOL_GPL(nfnetlink_subsys_register); int nfnetlink_subsys_unregister(const struct nfnetlink_subsystem *n) { nfnl_lock(n->subsys_id); table[n->subsys_id].subsys = NULL; nfnl_unlock(n->subsys_id); synchronize_rcu(); return 0; } EXPORT_SYMBOL_GPL(nfnetlink_subsys_unregister); static inline const struct nfnetlink_subsystem *nfnetlink_get_subsys(u16 type) { u8 subsys_id = NFNL_SUBSYS_ID(type); if (subsys_id >= NFNL_SUBSYS_COUNT) return NULL; return rcu_dereference(table[subsys_id].subsys); } static inline const struct nfnl_callback * nfnetlink_find_client(u16 type, const struct nfnetlink_subsystem *ss) { u8 cb_id = NFNL_MSG_TYPE(type); if (cb_id >= ss->cb_count) return NULL; return &ss->cb[cb_id]; } int nfnetlink_has_listeners(struct net *net, unsigned int group) { struct nfnl_net *nfnlnet = nfnl_pernet(net); return netlink_has_listeners(nfnlnet->nfnl, group); } EXPORT_SYMBOL_GPL(nfnetlink_has_listeners); int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 portid, unsigned int group, int echo, gfp_t flags) { struct nfnl_net *nfnlnet = nfnl_pernet(net); return nlmsg_notify(nfnlnet->nfnl, skb, portid, group, echo, flags); } EXPORT_SYMBOL_GPL(nfnetlink_send); int nfnetlink_set_err(struct net *net, u32 portid, u32 group, int error) { struct nfnl_net *nfnlnet = nfnl_pernet(net); return netlink_set_err(nfnlnet->nfnl, portid, group, error); } EXPORT_SYMBOL_GPL(nfnetlink_set_err); int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid) { struct nfnl_net *nfnlnet = nfnl_pernet(net); int err; err = nlmsg_unicast(nfnlnet->nfnl, skb, portid); if (err == -EAGAIN) err = -ENOBUFS; return err; } EXPORT_SYMBOL_GPL(nfnetlink_unicast); void nfnetlink_broadcast(struct net *net, struct sk_buff *skb, __u32 portid, __u32 group, gfp_t allocation) { struct nfnl_net *nfnlnet = nfnl_pernet(net); netlink_broadcast(nfnlnet->nfnl, skb, portid, group, allocation); } EXPORT_SYMBOL_GPL(nfnetlink_broadcast); /* Process one complete nfnetlink message. */ static int nfnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); const struct nfnl_callback *nc; const struct nfnetlink_subsystem *ss; int type, err; /* All the messages must at least contain nfgenmsg */ if (nlmsg_len(nlh) < sizeof(struct nfgenmsg)) return 0; type = nlh->nlmsg_type; replay: rcu_read_lock(); ss = nfnetlink_get_subsys(type); if (!ss) { #ifdef CONFIG_MODULES rcu_read_unlock(); request_module("nfnetlink-subsys-%d", NFNL_SUBSYS_ID(type)); rcu_read_lock(); ss = nfnetlink_get_subsys(type); if (!ss) #endif { rcu_read_unlock(); return -EINVAL; } } nc = nfnetlink_find_client(type, ss); if (!nc) { rcu_read_unlock(); return -EINVAL; } { int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); struct nfnl_net *nfnlnet = nfnl_pernet(net); u8 cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type); struct nlattr *cda[NFNL_MAX_ATTR_COUNT + 1]; struct nlattr *attr = (void *)nlh + min_len; int attrlen = nlh->nlmsg_len - min_len; __u8 subsys_id = NFNL_SUBSYS_ID(type); struct nfnl_info info = { .net = net, .sk = nfnlnet->nfnl, .nlh = nlh, .nfmsg = nlmsg_data(nlh), .extack = extack, }; /* Sanity-check NFNL_MAX_ATTR_COUNT */ if (ss->cb[cb_id].attr_count > NFNL_MAX_ATTR_COUNT) { rcu_read_unlock(); return -ENOMEM; } err = nla_parse_deprecated(cda, ss->cb[cb_id].attr_count, attr, attrlen, ss->cb[cb_id].policy, extack); if (err < 0) { rcu_read_unlock(); return err; } if (!nc->call) { rcu_read_unlock(); return -EINVAL; } switch (nc->type) { case NFNL_CB_RCU: err = nc->call(skb, &info, (const struct nlattr **)cda); rcu_read_unlock(); break; case NFNL_CB_MUTEX: rcu_read_unlock(); nfnl_lock(subsys_id); if (nfnl_dereference_protected(subsys_id) != ss || nfnetlink_find_client(type, ss) != nc) { nfnl_unlock(subsys_id); err = -EAGAIN; break; } err = nc->call(skb, &info, (const struct nlattr **)cda); nfnl_unlock(subsys_id); break; default: rcu_read_unlock(); err = -EINVAL; break; } if (err == -EAGAIN) goto replay; return err; } } struct nfnl_err { struct list_head head; struct nlmsghdr *nlh; int err; struct netlink_ext_ack extack; }; static int nfnl_err_add(struct list_head *list, struct nlmsghdr *nlh, int err, const struct netlink_ext_ack *extack) { struct nfnl_err *nfnl_err; nfnl_err = kmalloc(sizeof(struct nfnl_err), GFP_KERNEL); if (nfnl_err == NULL) return -ENOMEM; nfnl_err->nlh = nlh; nfnl_err->err = err; nfnl_err->extack = *extack; list_add_tail(&nfnl_err->head, list); return 0; } static void nfnl_err_del(struct nfnl_err *nfnl_err) { list_del(&nfnl_err->head); kfree(nfnl_err); } static void nfnl_err_reset(struct list_head *err_list) { struct nfnl_err *nfnl_err, *next; list_for_each_entry_safe(nfnl_err, next, err_list, head) nfnl_err_del(nfnl_err); } static void nfnl_err_deliver(struct list_head *err_list, struct sk_buff *skb) { struct nfnl_err *nfnl_err, *next; list_for_each_entry_safe(nfnl_err, next, err_list, head) { netlink_ack(skb, nfnl_err->nlh, nfnl_err->err, &nfnl_err->extack); nfnl_err_del(nfnl_err); } } enum { NFNL_BATCH_FAILURE = (1 << 0), NFNL_BATCH_DONE = (1 << 1), NFNL_BATCH_REPLAY = (1 << 2), }; static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh, u16 subsys_id, u32 genid) { struct sk_buff *oskb = skb; struct net *net = sock_net(skb->sk); const struct nfnetlink_subsystem *ss; const struct nfnl_callback *nc; struct netlink_ext_ack extack; LIST_HEAD(err_list); u32 status; int err; if (subsys_id >= NFNL_SUBSYS_COUNT) return netlink_ack(skb, nlh, -EINVAL, NULL); replay: status = 0; replay_abort: skb = netlink_skb_clone(oskb, GFP_KERNEL); if (!skb) return netlink_ack(oskb, nlh, -ENOMEM, NULL); nfnl_lock(subsys_id); ss = nfnl_dereference_protected(subsys_id); if (!ss) { #ifdef CONFIG_MODULES nfnl_unlock(subsys_id); request_module("nfnetlink-subsys-%d", subsys_id); nfnl_lock(subsys_id); ss = nfnl_dereference_protected(subsys_id); if (!ss) #endif { nfnl_unlock(subsys_id); netlink_ack(oskb, nlh, -EOPNOTSUPP, NULL); return kfree_skb(skb); } } if (!ss->valid_genid || !ss->commit || !ss->abort) { nfnl_unlock(subsys_id); netlink_ack(oskb, nlh, -EOPNOTSUPP, NULL); return kfree_skb(skb); } if (!try_module_get(ss->owner)) { nfnl_unlock(subsys_id); netlink_ack(oskb, nlh, -EOPNOTSUPP, NULL); return kfree_skb(skb); } if (!ss->valid_genid(net, genid)) { module_put(ss->owner); nfnl_unlock(subsys_id); netlink_ack(oskb, nlh, -ERESTART, NULL); return kfree_skb(skb); } nfnl_unlock(subsys_id); while (skb->len >= nlmsg_total_size(0)) { int msglen, type; if (fatal_signal_pending(current)) { nfnl_err_reset(&err_list); err = -EINTR; status = NFNL_BATCH_FAILURE; goto done; } memset(&extack, 0, sizeof(extack)); nlh = nlmsg_hdr(skb); err = 0; if (nlh->nlmsg_len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len || nlmsg_len(nlh) < sizeof(struct nfgenmsg)) { nfnl_err_reset(&err_list); status |= NFNL_BATCH_FAILURE; goto done; } /* Only requests are handled by the kernel */ if (!(nlh->nlmsg_flags & NLM_F_REQUEST)) { err = -EINVAL; goto ack; } type = nlh->nlmsg_type; if (type == NFNL_MSG_BATCH_BEGIN) { /* Malformed: Batch begin twice */ nfnl_err_reset(&err_list); status |= NFNL_BATCH_FAILURE; goto done; } else if (type == NFNL_MSG_BATCH_END) { status |= NFNL_BATCH_DONE; goto done; } else if (type < NLMSG_MIN_TYPE) { err = -EINVAL; goto ack; } /* We only accept a batch with messages for the same * subsystem. */ if (NFNL_SUBSYS_ID(type) != subsys_id) { err = -EINVAL; goto ack; } nc = nfnetlink_find_client(type, ss); if (!nc) { err = -EINVAL; goto ack; } if (nc->type != NFNL_CB_BATCH) { err = -EINVAL; goto ack; } { int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); struct nfnl_net *nfnlnet = nfnl_pernet(net); struct nlattr *cda[NFNL_MAX_ATTR_COUNT + 1]; struct nlattr *attr = (void *)nlh + min_len; u8 cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type); int attrlen = nlh->nlmsg_len - min_len; struct nfnl_info info = { .net = net, .sk = nfnlnet->nfnl, .nlh = nlh, .nfmsg = nlmsg_data(nlh), .extack = &extack, }; /* Sanity-check NFTA_MAX_ATTR */ if (ss->cb[cb_id].attr_count > NFNL_MAX_ATTR_COUNT) { err = -ENOMEM; goto ack; } err = nla_parse_deprecated(cda, ss->cb[cb_id].attr_count, attr, attrlen, ss->cb[cb_id].policy, NULL); if (err < 0) goto ack; err = nc->call(skb, &info, (const struct nlattr **)cda); /* The lock was released to autoload some module, we * have to abort and start from scratch using the * original skb. */ if (err == -EAGAIN) { status |= NFNL_BATCH_REPLAY; goto done; } } ack: if (nlh->nlmsg_flags & NLM_F_ACK || err) { /* Errors are delivered once the full batch has been * processed, this avoids that the same error is * reported several times when replaying the batch. */ if (err == -ENOMEM || nfnl_err_add(&err_list, nlh, err, &extack) < 0) { /* We failed to enqueue an error, reset the * list of errors and send OOM to userspace * pointing to the batch header. */ nfnl_err_reset(&err_list); netlink_ack(oskb, nlmsg_hdr(oskb), -ENOMEM, NULL); status |= NFNL_BATCH_FAILURE; goto done; } /* We don't stop processing the batch on errors, thus, * userspace gets all the errors that the batch * triggers. */ if (err) status |= NFNL_BATCH_FAILURE; } msglen = NLMSG_ALIGN(nlh->nlmsg_len); if (msglen > skb->len) msglen = skb->len; skb_pull(skb, msglen); } done: if (status & NFNL_BATCH_REPLAY) { ss->abort(net, oskb, NFNL_ABORT_AUTOLOAD); nfnl_err_reset(&err_list); kfree_skb(skb); module_put(ss->owner); goto replay; } else if (status == NFNL_BATCH_DONE) { err = ss->commit(net, oskb); if (err == -EAGAIN) { status |= NFNL_BATCH_REPLAY; goto done; } else if (err) { ss->abort(net, oskb, NFNL_ABORT_NONE); netlink_ack(oskb, nlmsg_hdr(oskb), err, NULL); } } else { enum nfnl_abort_action abort_action; if (status & NFNL_BATCH_FAILURE) abort_action = NFNL_ABORT_NONE; else abort_action = NFNL_ABORT_VALIDATE; err = ss->abort(net, oskb, abort_action); if (err == -EAGAIN) { nfnl_err_reset(&err_list); kfree_skb(skb); module_put(ss->owner); status |= NFNL_BATCH_FAILURE; goto replay_abort; } } nfnl_err_deliver(&err_list, oskb); kfree_skb(skb); module_put(ss->owner); } static const struct nla_policy nfnl_batch_policy[NFNL_BATCH_MAX + 1] = { [NFNL_BATCH_GENID] = { .type = NLA_U32 }, }; static void nfnetlink_rcv_skb_batch(struct sk_buff *skb, struct nlmsghdr *nlh) { int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); struct nlattr *attr = (void *)nlh + min_len; struct nlattr *cda[NFNL_BATCH_MAX + 1]; int attrlen = nlh->nlmsg_len - min_len; struct nfgenmsg *nfgenmsg; int msglen, err; u32 gen_id = 0; u16 res_id; msglen = NLMSG_ALIGN(nlh->nlmsg_len); if (msglen > skb->len) msglen = skb->len; if (skb->len < NLMSG_HDRLEN + sizeof(struct nfgenmsg)) return; err = nla_parse_deprecated(cda, NFNL_BATCH_MAX, attr, attrlen, nfnl_batch_policy, NULL); if (err < 0) { netlink_ack(skb, nlh, err, NULL); return; } if (cda[NFNL_BATCH_GENID]) gen_id = ntohl(nla_get_be32(cda[NFNL_BATCH_GENID])); nfgenmsg = nlmsg_data(nlh); skb_pull(skb, msglen); /* Work around old nft using host byte order */ if (nfgenmsg->res_id == (__force __be16)NFNL_SUBSYS_NFTABLES) res_id = NFNL_SUBSYS_NFTABLES; else res_id = ntohs(nfgenmsg->res_id); nfnetlink_rcv_batch(skb, nlh, res_id, gen_id); } static void nfnetlink_rcv(struct sk_buff *skb) { struct nlmsghdr *nlh = nlmsg_hdr(skb); if (skb->len < NLMSG_HDRLEN || nlh->nlmsg_len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len) return; if (!netlink_net_capable(skb, CAP_NET_ADMIN)) { netlink_ack(skb, nlh, -EPERM, NULL); return; } if (nlh->nlmsg_type == NFNL_MSG_BATCH_BEGIN) nfnetlink_rcv_skb_batch(skb, nlh); else netlink_rcv_skb(skb, nfnetlink_rcv_msg); } static void nfnetlink_bind_event(struct net *net, unsigned int group) { #ifdef CONFIG_NF_CONNTRACK_EVENTS int type, group_bit; u8 v; /* All NFNLGRP_CONNTRACK_* group bits fit into u8. * The other groups are not relevant and can be ignored. */ if (group >= 8) return; type = nfnl_group2type[group]; switch (type) { case NFNL_SUBSYS_CTNETLINK: break; case NFNL_SUBSYS_CTNETLINK_EXP: break; default: return; } group_bit = (1 << group); spin_lock(&nfnl_grp_active_lock); v = READ_ONCE(nf_ctnetlink_has_listener); if ((v & group_bit) == 0) { v |= group_bit; /* read concurrently without nfnl_grp_active_lock held. */ WRITE_ONCE(nf_ctnetlink_has_listener, v); } spin_unlock(&nfnl_grp_active_lock); #endif } static int nfnetlink_bind(struct net *net, int group) { const struct nfnetlink_subsystem *ss; int type; if (group <= NFNLGRP_NONE || group > NFNLGRP_MAX) return 0; type = nfnl_group2type[group]; rcu_read_lock(); ss = nfnetlink_get_subsys(type << 8); rcu_read_unlock(); if (!ss) request_module_nowait("nfnetlink-subsys-%d", type); nfnetlink_bind_event(net, group); return 0; } static void nfnetlink_unbind(struct net *net, int group) { #ifdef CONFIG_NF_CONNTRACK_EVENTS int type, group_bit; if (group <= NFNLGRP_NONE || group > NFNLGRP_MAX) return; type = nfnl_group2type[group]; switch (type) { case NFNL_SUBSYS_CTNETLINK: break; case NFNL_SUBSYS_CTNETLINK_EXP: break; default: return; } /* ctnetlink_has_listener is u8 */ if (group >= 8) return; group_bit = (1 << group); spin_lock(&nfnl_grp_active_lock); if (!nfnetlink_has_listeners(net, group)) { u8 v = READ_ONCE(nf_ctnetlink_has_listener); v &= ~group_bit; /* read concurrently without nfnl_grp_active_lock held. */ WRITE_ONCE(nf_ctnetlink_has_listener, v); } spin_unlock(&nfnl_grp_active_lock); #endif } static int __net_init nfnetlink_net_init(struct net *net) { struct nfnl_net *nfnlnet = nfnl_pernet(net); struct netlink_kernel_cfg cfg = { .groups = NFNLGRP_MAX, .input = nfnetlink_rcv, .bind = nfnetlink_bind, .unbind = nfnetlink_unbind, }; nfnlnet->nfnl = netlink_kernel_create(net, NETLINK_NETFILTER, &cfg); if (!nfnlnet->nfnl) return -ENOMEM; return 0; } static void __net_exit nfnetlink_net_exit_batch(struct list_head *net_exit_list) { struct nfnl_net *nfnlnet; struct net *net; list_for_each_entry(net, net_exit_list, exit_list) { nfnlnet = nfnl_pernet(net); netlink_kernel_release(nfnlnet->nfnl); } } static struct pernet_operations nfnetlink_net_ops = { .init = nfnetlink_net_init, .exit_batch = nfnetlink_net_exit_batch, .id = &nfnetlink_pernet_id, .size = sizeof(struct nfnl_net), }; static int __init nfnetlink_init(void) { int i; for (i = NFNLGRP_NONE + 1; i <= NFNLGRP_MAX; i++) BUG_ON(nfnl_group2type[i] == NFNL_SUBSYS_NONE); for (i=0; i<NFNL_SUBSYS_COUNT; i++) __mutex_init(&table[i].mutex, nfnl_lockdep_names[i], &nfnl_lockdep_keys[i]); return register_pernet_subsys(&nfnetlink_net_ops); } static void __exit nfnetlink_exit(void) { unregister_pernet_subsys(&nfnetlink_net_ops); } module_init(nfnetlink_init); module_exit(nfnetlink_exit);
55 1 6 6 2 4 3 3 6 6 6 2 2 2 2 1 1 2 2 3 3 3 3 54 54 2 54 54 54 54 56 56 57 56 1 57 1 1 55 51 5 56 56 56 56 55 56 56 56 1 14 50 54 53 54 53 54 54 13 41 1 1 2 2 2 2 1 1 2 2 54 53 2 2 1 1 2 2 2 1 1 2 1 4 1 1 1 1 3 3 3 1 3 1 2 2 2 1 1 1 1 2 1 2 1 1 3 1 1 1 4 2 2 1 1 1 1 1 1 8 1 2 1 1 3 3 3 4 4 2 2 1 1 4 1 3 1 3 2 1 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 // SPDX-License-Identifier: GPL-2.0-only /* * vivid-vid-cap.c - video capture support functions. * * Copyright 2014 Cisco Systems, Inc. and/or its affiliates. All rights reserved. */ #include <linux/errno.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/vmalloc.h> #include <linux/videodev2.h> #include <linux/v4l2-dv-timings.h> #include <media/v4l2-common.h> #include <media/v4l2-event.h> #include <media/v4l2-dv-timings.h> #include <media/v4l2-rect.h> #include "vivid-core.h" #include "vivid-vid-common.h" #include "vivid-kthread-cap.h" #include "vivid-vid-cap.h" /* Sizes must be in increasing order */ static const struct v4l2_frmsize_discrete webcam_sizes[] = { { 320, 180 }, { 640, 360 }, { 640, 480 }, { 1280, 720 }, { 1920, 1080 }, { 3840, 2160 }, }; /* * Intervals must be in increasing order and there must be twice as many * elements in this array as there are in webcam_sizes. */ static const struct v4l2_fract webcam_intervals[] = { { 1, 1 }, { 1, 2 }, { 1, 4 }, { 1, 5 }, { 1, 10 }, { 2, 25 }, { 1, 15 }, /* 7 - maximum for 2160p */ { 1, 25 }, { 1, 30 }, /* 9 - maximum for 1080p */ { 1, 40 }, { 1, 50 }, { 1, 60 }, /* 12 - maximum for 720p */ { 1, 120 }, }; /* Limit maximum FPS rates for high resolutions */ #define IVAL_COUNT_720P 12 /* 720p and up is limited to 60 fps */ #define IVAL_COUNT_1080P 9 /* 1080p and up is limited to 30 fps */ #define IVAL_COUNT_2160P 7 /* 2160p and up is limited to 15 fps */ static inline unsigned int webcam_ival_count(const struct vivid_dev *dev, unsigned int frmsize_idx) { if (webcam_sizes[frmsize_idx].height >= 2160) return IVAL_COUNT_2160P; if (webcam_sizes[frmsize_idx].height >= 1080) return IVAL_COUNT_1080P; if (webcam_sizes[frmsize_idx].height >= 720) return IVAL_COUNT_720P; /* For low resolutions, allow all FPS rates */ return ARRAY_SIZE(webcam_intervals); } static int vid_cap_queue_setup(struct vb2_queue *vq, unsigned *nbuffers, unsigned *nplanes, unsigned sizes[], struct device *alloc_devs[]) { struct vivid_dev *dev = vb2_get_drv_priv(vq); unsigned buffers = tpg_g_buffers(&dev->tpg); unsigned h = dev->fmt_cap_rect.height; unsigned p; if (dev->field_cap == V4L2_FIELD_ALTERNATE) { /* * You cannot use read() with FIELD_ALTERNATE since the field * information (TOP/BOTTOM) cannot be passed back to the user. */ if (vb2_fileio_is_active(vq)) return -EINVAL; } if (dev->queue_setup_error) { /* * Error injection: test what happens if queue_setup() returns * an error. */ dev->queue_setup_error = false; return -EINVAL; } if (*nplanes) { /* * Check if the number of requested planes match * the number of buffers in the current format. You can't mix that. */ if (*nplanes != buffers) return -EINVAL; for (p = 0; p < buffers; p++) { if (sizes[p] < tpg_g_line_width(&dev->tpg, p) * h + dev->fmt_cap->data_offset[p]) return -EINVAL; } } else { for (p = 0; p < buffers; p++) sizes[p] = (tpg_g_line_width(&dev->tpg, p) * h) / dev->fmt_cap->vdownsampling[p] + dev->fmt_cap->data_offset[p]; } *nplanes = buffers; dprintk(dev, 1, "%s: count=%d\n", __func__, *nbuffers); for (p = 0; p < buffers; p++) dprintk(dev, 1, "%s: size[%u]=%u\n", __func__, p, sizes[p]); return 0; } static int vid_cap_buf_prepare(struct vb2_buffer *vb) { struct vivid_dev *dev = vb2_get_drv_priv(vb->vb2_queue); unsigned long size; unsigned buffers = tpg_g_buffers(&dev->tpg); unsigned p; dprintk(dev, 1, "%s\n", __func__); if (WARN_ON(NULL == dev->fmt_cap)) return -EINVAL; if (dev->buf_prepare_error) { /* * Error injection: test what happens if buf_prepare() returns * an error. */ dev->buf_prepare_error = false; return -EINVAL; } for (p = 0; p < buffers; p++) { size = (tpg_g_line_width(&dev->tpg, p) * dev->fmt_cap_rect.height) / dev->fmt_cap->vdownsampling[p] + dev->fmt_cap->data_offset[p]; if (vb2_plane_size(vb, p) < size) { dprintk(dev, 1, "%s data will not fit into plane %u (%lu < %lu)\n", __func__, p, vb2_plane_size(vb, p), size); return -EINVAL; } vb2_set_plane_payload(vb, p, size); vb->planes[p].data_offset = dev->fmt_cap->data_offset[p]; } return 0; } static void vid_cap_buf_finish(struct vb2_buffer *vb) { struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb); struct vivid_dev *dev = vb2_get_drv_priv(vb->vb2_queue); struct v4l2_timecode *tc = &vbuf->timecode; unsigned fps = 25; unsigned seq = vbuf->sequence; if (!vivid_is_sdtv_cap(dev)) return; /* * Set the timecode. Rarely used, so it is interesting to * test this. */ vbuf->flags |= V4L2_BUF_FLAG_TIMECODE; if (dev->std_cap[dev->input] & V4L2_STD_525_60) fps = 30; tc->type = (fps == 30) ? V4L2_TC_TYPE_30FPS : V4L2_TC_TYPE_25FPS; tc->flags = 0; tc->frames = seq % fps; tc->seconds = (seq / fps) % 60; tc->minutes = (seq / (60 * fps)) % 60; tc->hours = (seq / (60 * 60 * fps)) % 24; } static void vid_cap_buf_queue(struct vb2_buffer *vb) { struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb); struct vivid_dev *dev = vb2_get_drv_priv(vb->vb2_queue); struct vivid_buffer *buf = container_of(vbuf, struct vivid_buffer, vb); dprintk(dev, 1, "%s\n", __func__); spin_lock(&dev->slock); list_add_tail(&buf->list, &dev->vid_cap_active); spin_unlock(&dev->slock); } static int vid_cap_start_streaming(struct vb2_queue *vq, unsigned count) { struct vivid_dev *dev = vb2_get_drv_priv(vq); unsigned i; int err; if (vb2_is_streaming(&dev->vb_vid_out_q)) dev->can_loop_video = vivid_vid_can_loop(dev); dev->vid_cap_seq_count = 0; dprintk(dev, 1, "%s\n", __func__); for (i = 0; i < VIDEO_MAX_FRAME; i++) dev->must_blank[i] = tpg_g_perc_fill(&dev->tpg) < 100; if (dev->start_streaming_error) { dev->start_streaming_error = false; err = -EINVAL; } else { err = vivid_start_generating_vid_cap(dev, &dev->vid_cap_streaming); } if (err) { struct vivid_buffer *buf, *tmp; list_for_each_entry_safe(buf, tmp, &dev->vid_cap_active, list) { list_del(&buf->list); vb2_buffer_done(&buf->vb.vb2_buf, VB2_BUF_STATE_QUEUED); } } return err; } /* abort streaming and wait for last buffer */ static void vid_cap_stop_streaming(struct vb2_queue *vq) { struct vivid_dev *dev = vb2_get_drv_priv(vq); dprintk(dev, 1, "%s\n", __func__); vivid_stop_generating_vid_cap(dev, &dev->vid_cap_streaming); dev->can_loop_video = false; } static void vid_cap_buf_request_complete(struct vb2_buffer *vb) { struct vivid_dev *dev = vb2_get_drv_priv(vb->vb2_queue); v4l2_ctrl_request_complete(vb->req_obj.req, &dev->ctrl_hdl_vid_cap); } const struct vb2_ops vivid_vid_cap_qops = { .queue_setup = vid_cap_queue_setup, .buf_prepare = vid_cap_buf_prepare, .buf_finish = vid_cap_buf_finish, .buf_queue = vid_cap_buf_queue, .start_streaming = vid_cap_start_streaming, .stop_streaming = vid_cap_stop_streaming, .buf_request_complete = vid_cap_buf_request_complete, .wait_prepare = vb2_ops_wait_prepare, .wait_finish = vb2_ops_wait_finish, }; /* * Determine the 'picture' quality based on the current TV frequency: either * COLOR for a good 'signal', GRAY (grayscale picture) for a slightly off * signal or NOISE for no signal. */ void vivid_update_quality(struct vivid_dev *dev) { unsigned freq_modulus; if (dev->loop_video && (vivid_is_svid_cap(dev) || vivid_is_hdmi_cap(dev))) { /* * The 'noise' will only be replaced by the actual video * if the output video matches the input video settings. */ tpg_s_quality(&dev->tpg, TPG_QUAL_NOISE, 0); return; } if (vivid_is_hdmi_cap(dev) && VIVID_INVALID_SIGNAL(dev->dv_timings_signal_mode[dev->input])) { tpg_s_quality(&dev->tpg, TPG_QUAL_NOISE, 0); return; } if (vivid_is_sdtv_cap(dev) && VIVID_INVALID_SIGNAL(dev->std_signal_mode[dev->input])) { tpg_s_quality(&dev->tpg, TPG_QUAL_NOISE, 0); return; } if (!vivid_is_tv_cap(dev)) { tpg_s_quality(&dev->tpg, TPG_QUAL_COLOR, 0); return; } /* * There is a fake channel every 6 MHz at 49.25, 55.25, etc. * From +/- 0.25 MHz around the channel there is color, and from * +/- 1 MHz there is grayscale (chroma is lost). * Everywhere else it is just noise. */ freq_modulus = (dev->tv_freq - 676 /* (43.25-1) * 16 */) % (6 * 16); if (freq_modulus > 2 * 16) { tpg_s_quality(&dev->tpg, TPG_QUAL_NOISE, next_pseudo_random32(dev->tv_freq ^ 0x55) & 0x3f); return; } if (freq_modulus < 12 /*0.75 * 16*/ || freq_modulus > 20 /*1.25 * 16*/) tpg_s_quality(&dev->tpg, TPG_QUAL_GRAY, 0); else tpg_s_quality(&dev->tpg, TPG_QUAL_COLOR, 0); } /* * Get the current picture quality and the associated afc value. */ static enum tpg_quality vivid_get_quality(struct vivid_dev *dev, s32 *afc) { unsigned freq_modulus; if (afc) *afc = 0; if (tpg_g_quality(&dev->tpg) == TPG_QUAL_COLOR || tpg_g_quality(&dev->tpg) == TPG_QUAL_NOISE) return tpg_g_quality(&dev->tpg); /* * There is a fake channel every 6 MHz at 49.25, 55.25, etc. * From +/- 0.25 MHz around the channel there is color, and from * +/- 1 MHz there is grayscale (chroma is lost). * Everywhere else it is just gray. */ freq_modulus = (dev->tv_freq - 676 /* (43.25-1) * 16 */) % (6 * 16); if (afc) *afc = freq_modulus - 1 * 16; return TPG_QUAL_GRAY; } enum tpg_video_aspect vivid_get_video_aspect(const struct vivid_dev *dev) { if (vivid_is_sdtv_cap(dev)) return dev->std_aspect_ratio[dev->input]; if (vivid_is_hdmi_cap(dev)) return dev->dv_timings_aspect_ratio[dev->input]; return TPG_VIDEO_ASPECT_IMAGE; } static enum tpg_pixel_aspect vivid_get_pixel_aspect(const struct vivid_dev *dev) { if (vivid_is_sdtv_cap(dev)) return (dev->std_cap[dev->input] & V4L2_STD_525_60) ? TPG_PIXEL_ASPECT_NTSC : TPG_PIXEL_ASPECT_PAL; if (vivid_is_hdmi_cap(dev) && dev->src_rect.width == 720 && dev->src_rect.height <= 576) return dev->src_rect.height == 480 ? TPG_PIXEL_ASPECT_NTSC : TPG_PIXEL_ASPECT_PAL; return TPG_PIXEL_ASPECT_SQUARE; } /* * Called whenever the format has to be reset which can occur when * changing inputs, standard, timings, etc. */ void vivid_update_format_cap(struct vivid_dev *dev, bool keep_controls) { struct v4l2_bt_timings *bt = &dev->dv_timings_cap[dev->input].bt; u32 dims[V4L2_CTRL_MAX_DIMS] = {}; unsigned size; u64 pixelclock; switch (dev->input_type[dev->input]) { case WEBCAM: default: dev->src_rect.width = webcam_sizes[dev->webcam_size_idx].width; dev->src_rect.height = webcam_sizes[dev->webcam_size_idx].height; dev->timeperframe_vid_cap = webcam_intervals[dev->webcam_ival_idx]; dev->field_cap = V4L2_FIELD_NONE; tpg_s_rgb_range(&dev->tpg, V4L2_DV_RGB_RANGE_AUTO); break; case TV: case SVID: dev->field_cap = dev->tv_field_cap; dev->src_rect.width = 720; if (dev->std_cap[dev->input] & V4L2_STD_525_60) { dev->src_rect.height = 480; dev->timeperframe_vid_cap = (struct v4l2_fract) { 1001, 30000 }; dev->service_set_cap = V4L2_SLICED_CAPTION_525; } else { dev->src_rect.height = 576; dev->timeperframe_vid_cap = (struct v4l2_fract) { 1000, 25000 }; dev->service_set_cap = V4L2_SLICED_WSS_625 | V4L2_SLICED_TELETEXT_B; } tpg_s_rgb_range(&dev->tpg, V4L2_DV_RGB_RANGE_AUTO); break; case HDMI: dev->src_rect.width = bt->width; dev->src_rect.height = bt->height; size = V4L2_DV_BT_FRAME_WIDTH(bt) * V4L2_DV_BT_FRAME_HEIGHT(bt); if (dev->reduced_fps && can_reduce_fps(bt)) { pixelclock = div_u64(bt->pixelclock * 1000, 1001); bt->flags |= V4L2_DV_FL_REDUCED_FPS; } else { pixelclock = bt->pixelclock; bt->flags &= ~V4L2_DV_FL_REDUCED_FPS; } dev->timeperframe_vid_cap = (struct v4l2_fract) { size / 100, (u32)pixelclock / 100 }; if (bt->interlaced) dev->field_cap = V4L2_FIELD_ALTERNATE; else dev->field_cap = V4L2_FIELD_NONE; /* * We can be called from within s_ctrl, in that case we can't * set/get controls. Luckily we don't need to in that case. */ if (keep_controls || !dev->colorspace) break; if (bt->flags & V4L2_DV_FL_IS_CE_VIDEO) { if (bt->width == 720 && bt->height <= 576) v4l2_ctrl_s_ctrl(dev->colorspace, VIVID_CS_170M); else v4l2_ctrl_s_ctrl(dev->colorspace, VIVID_CS_709); v4l2_ctrl_s_ctrl(dev->real_rgb_range_cap, 1); } else { v4l2_ctrl_s_ctrl(dev->colorspace, VIVID_CS_SRGB); v4l2_ctrl_s_ctrl(dev->real_rgb_range_cap, 0); } tpg_s_rgb_range(&dev->tpg, v4l2_ctrl_g_ctrl(dev->rgb_range_cap)); break; } vivid_update_quality(dev); tpg_reset_source(&dev->tpg, dev->src_rect.width, dev->src_rect.height, dev->field_cap); dev->crop_cap = dev->src_rect; dev->crop_bounds_cap = dev->src_rect; dev->compose_cap = dev->crop_cap; if (V4L2_FIELD_HAS_T_OR_B(dev->field_cap)) dev->compose_cap.height /= 2; dev->fmt_cap_rect = dev->compose_cap; tpg_s_video_aspect(&dev->tpg, vivid_get_video_aspect(dev)); tpg_s_pixel_aspect(&dev->tpg, vivid_get_pixel_aspect(dev)); tpg_update_mv_step(&dev->tpg); /* * We can be called from within s_ctrl, in that case we can't * modify controls. Luckily we don't need to in that case. */ if (keep_controls) return; dims[0] = roundup(dev->src_rect.width, PIXEL_ARRAY_DIV); dims[1] = roundup(dev->src_rect.height, PIXEL_ARRAY_DIV); v4l2_ctrl_modify_dimensions(dev->pixel_array, dims); } /* Map the field to something that is valid for the current input */ static enum v4l2_field vivid_field_cap(struct vivid_dev *dev, enum v4l2_field field) { if (vivid_is_sdtv_cap(dev)) { switch (field) { case V4L2_FIELD_INTERLACED_TB: case V4L2_FIELD_INTERLACED_BT: case V4L2_FIELD_SEQ_TB: case V4L2_FIELD_SEQ_BT: case V4L2_FIELD_TOP: case V4L2_FIELD_BOTTOM: case V4L2_FIELD_ALTERNATE: return field; case V4L2_FIELD_INTERLACED: default: return V4L2_FIELD_INTERLACED; } } if (vivid_is_hdmi_cap(dev)) return dev->dv_timings_cap[dev->input].bt.interlaced ? V4L2_FIELD_ALTERNATE : V4L2_FIELD_NONE; return V4L2_FIELD_NONE; } static unsigned vivid_colorspace_cap(struct vivid_dev *dev) { if (!dev->loop_video || vivid_is_webcam(dev) || vivid_is_tv_cap(dev)) return tpg_g_colorspace(&dev->tpg); return dev->colorspace_out; } static unsigned vivid_xfer_func_cap(struct vivid_dev *dev) { if (!dev->loop_video || vivid_is_webcam(dev) || vivid_is_tv_cap(dev)) return tpg_g_xfer_func(&dev->tpg); return dev->xfer_func_out; } static unsigned vivid_ycbcr_enc_cap(struct vivid_dev *dev) { if (!dev->loop_video || vivid_is_webcam(dev) || vivid_is_tv_cap(dev)) return tpg_g_ycbcr_enc(&dev->tpg); return dev->ycbcr_enc_out; } static unsigned int vivid_hsv_enc_cap(struct vivid_dev *dev) { if (!dev->loop_video || vivid_is_webcam(dev) || vivid_is_tv_cap(dev)) return tpg_g_hsv_enc(&dev->tpg); return dev->hsv_enc_out; } static unsigned vivid_quantization_cap(struct vivid_dev *dev) { if (!dev->loop_video || vivid_is_webcam(dev) || vivid_is_tv_cap(dev)) return tpg_g_quantization(&dev->tpg); return dev->quantization_out; } int vivid_g_fmt_vid_cap(struct file *file, void *priv, struct v4l2_format *f) { struct vivid_dev *dev = video_drvdata(file); struct v4l2_pix_format_mplane *mp = &f->fmt.pix_mp; unsigned p; mp->width = dev->fmt_cap_rect.width; mp->height = dev->fmt_cap_rect.height; mp->field = dev->field_cap; mp->pixelformat = dev->fmt_cap->fourcc; mp->colorspace = vivid_colorspace_cap(dev); mp->xfer_func = vivid_xfer_func_cap(dev); if (dev->fmt_cap->color_enc == TGP_COLOR_ENC_HSV) mp->hsv_enc = vivid_hsv_enc_cap(dev); else mp->ycbcr_enc = vivid_ycbcr_enc_cap(dev); mp->quantization = vivid_quantization_cap(dev); mp->num_planes = dev->fmt_cap->buffers; for (p = 0; p < mp->num_planes; p++) { mp->plane_fmt[p].bytesperline = tpg_g_bytesperline(&dev->tpg, p); mp->plane_fmt[p].sizeimage = (tpg_g_line_width(&dev->tpg, p) * mp->height) / dev->fmt_cap->vdownsampling[p] + dev->fmt_cap->data_offset[p]; } return 0; } int vivid_try_fmt_vid_cap(struct file *file, void *priv, struct v4l2_format *f) { struct v4l2_pix_format_mplane *mp = &f->fmt.pix_mp; struct v4l2_plane_pix_format *pfmt = mp->plane_fmt; struct vivid_dev *dev = video_drvdata(file); const struct vivid_fmt *fmt; unsigned bytesperline, max_bpl; unsigned factor = 1; unsigned w, h; unsigned p; bool user_set_csc = !!(mp->flags & V4L2_PIX_FMT_FLAG_SET_CSC); fmt = vivid_get_format(dev, mp->pixelformat); if (!fmt) { dprintk(dev, 1, "Fourcc format (0x%08x) unknown.\n", mp->pixelformat); mp->pixelformat = V4L2_PIX_FMT_YUYV; fmt = vivid_get_format(dev, mp->pixelformat); } mp->field = vivid_field_cap(dev, mp->field); if (vivid_is_webcam(dev)) { const struct v4l2_frmsize_discrete *sz = v4l2_find_nearest_size(webcam_sizes, ARRAY_SIZE(webcam_sizes), width, height, mp->width, mp->height); w = sz->width; h = sz->height; } else if (vivid_is_sdtv_cap(dev)) { w = 720; h = (dev->std_cap[dev->input] & V4L2_STD_525_60) ? 480 : 576; } else { w = dev->src_rect.width; h = dev->src_rect.height; } if (V4L2_FIELD_HAS_T_OR_B(mp->field)) factor = 2; if (vivid_is_webcam(dev) || (!dev->has_scaler_cap && !dev->has_crop_cap && !dev->has_compose_cap)) { mp->width = w; mp->height = h / factor; } else { struct v4l2_rect r = { 0, 0, mp->width, mp->height * factor }; v4l2_rect_set_min_size(&r, &vivid_min_rect); v4l2_rect_set_max_size(&r, &vivid_max_rect); if (dev->has_scaler_cap && !dev->has_compose_cap) { struct v4l2_rect max_r = { 0, 0, MAX_ZOOM * w, MAX_ZOOM * h }; v4l2_rect_set_max_size(&r, &max_r); } else if (!dev->has_scaler_cap && dev->has_crop_cap && !dev->has_compose_cap) { v4l2_rect_set_max_size(&r, &dev->src_rect); } else if (!dev->has_scaler_cap && !dev->has_crop_cap) { v4l2_rect_set_min_size(&r, &dev->src_rect); } mp->width = r.width; mp->height = r.height / factor; } /* This driver supports custom bytesperline values */ mp->num_planes = fmt->buffers; for (p = 0; p < fmt->buffers; p++) { /* Calculate the minimum supported bytesperline value */ bytesperline = (mp->width * fmt->bit_depth[p]) >> 3; /* Calculate the maximum supported bytesperline value */ max_bpl = (MAX_ZOOM * MAX_WIDTH * fmt->bit_depth[p]) >> 3; if (pfmt[p].bytesperline > max_bpl) pfmt[p].bytesperline = max_bpl; if (pfmt[p].bytesperline < bytesperline) pfmt[p].bytesperline = bytesperline; pfmt[p].sizeimage = (pfmt[p].bytesperline * mp->height) / fmt->vdownsampling[p] + fmt->data_offset[p]; memset(pfmt[p].reserved, 0, sizeof(pfmt[p].reserved)); } for (p = fmt->buffers; p < fmt->planes; p++) pfmt[0].sizeimage += (pfmt[0].bytesperline * mp->height * (fmt->bit_depth[p] / fmt->vdownsampling[p])) / (fmt->bit_depth[0] / fmt->vdownsampling[0]); if (!user_set_csc || !v4l2_is_colorspace_valid(mp->colorspace)) mp->colorspace = vivid_colorspace_cap(dev); if (!user_set_csc || !v4l2_is_xfer_func_valid(mp->xfer_func)) mp->xfer_func = vivid_xfer_func_cap(dev); if (fmt->color_enc == TGP_COLOR_ENC_HSV) { if (!user_set_csc || !v4l2_is_hsv_enc_valid(mp->hsv_enc)) mp->hsv_enc = vivid_hsv_enc_cap(dev); } else if (fmt->color_enc == TGP_COLOR_ENC_YCBCR) { if (!user_set_csc || !v4l2_is_ycbcr_enc_valid(mp->ycbcr_enc)) mp->ycbcr_enc = vivid_ycbcr_enc_cap(dev); } else { mp->ycbcr_enc = vivid_ycbcr_enc_cap(dev); } if (fmt->color_enc == TGP_COLOR_ENC_YCBCR || fmt->color_enc == TGP_COLOR_ENC_RGB) { if (!user_set_csc || !v4l2_is_quant_valid(mp->quantization)) mp->quantization = vivid_quantization_cap(dev); } else { mp->quantization = vivid_quantization_cap(dev); } memset(mp->reserved, 0, sizeof(mp->reserved)); return 0; } int vivid_s_fmt_vid_cap(struct file *file, void *priv, struct v4l2_format *f) { struct v4l2_pix_format_mplane *mp = &f->fmt.pix_mp; struct vivid_dev *dev = video_drvdata(file); struct v4l2_rect *crop = &dev->crop_cap; struct v4l2_rect *compose = &dev->compose_cap; struct vb2_queue *q = &dev->vb_vid_cap_q; int ret = vivid_try_fmt_vid_cap(file, priv, f); unsigned factor = 1; unsigned p; unsigned i; if (ret < 0) return ret; if (vb2_is_busy(q)) { dprintk(dev, 1, "%s device busy\n", __func__); return -EBUSY; } dev->fmt_cap = vivid_get_format(dev, mp->pixelformat); if (V4L2_FIELD_HAS_T_OR_B(mp->field)) factor = 2; /* Note: the webcam input doesn't support scaling, cropping or composing */ if (!vivid_is_webcam(dev) && (dev->has_scaler_cap || dev->has_crop_cap || dev->has_compose_cap)) { struct v4l2_rect r = { 0, 0, mp->width, mp->height }; if (dev->has_scaler_cap) { if (dev->has_compose_cap) v4l2_rect_map_inside(compose, &r); else *compose = r; if (dev->has_crop_cap && !dev->has_compose_cap) { struct v4l2_rect min_r = { 0, 0, r.width / MAX_ZOOM, factor * r.height / MAX_ZOOM }; struct v4l2_rect max_r = { 0, 0, r.width * MAX_ZOOM, factor * r.height * MAX_ZOOM }; v4l2_rect_set_min_size(crop, &min_r); v4l2_rect_set_max_size(crop, &max_r); v4l2_rect_map_inside(crop, &dev->crop_bounds_cap); } else if (dev->has_crop_cap) { struct v4l2_rect min_r = { 0, 0, compose->width / MAX_ZOOM, factor * compose->height / MAX_ZOOM }; struct v4l2_rect max_r = { 0, 0, compose->width * MAX_ZOOM, factor * compose->height * MAX_ZOOM }; v4l2_rect_set_min_size(crop, &min_r); v4l2_rect_set_max_size(crop, &max_r); v4l2_rect_map_inside(crop, &dev->crop_bounds_cap); } } else if (dev->has_crop_cap && !dev->has_compose_cap) { r.height *= factor; v4l2_rect_set_size_to(crop, &r); v4l2_rect_map_inside(crop, &dev->crop_bounds_cap); r = *crop; r.height /= factor; v4l2_rect_set_size_to(compose, &r); } else if (!dev->has_crop_cap) { v4l2_rect_map_inside(compose, &r); } else { r.height *= factor; v4l2_rect_set_max_size(crop, &r); v4l2_rect_map_inside(crop, &dev->crop_bounds_cap); compose->top *= factor; compose->height *= factor; v4l2_rect_set_size_to(compose, crop); v4l2_rect_map_inside(compose, &r); compose->top /= factor; compose->height /= factor; } } else if (vivid_is_webcam(dev)) { unsigned int ival_sz = webcam_ival_count(dev, dev->webcam_size_idx); /* Guaranteed to be a match */ for (i = 0; i < ARRAY_SIZE(webcam_sizes); i++) if (webcam_sizes[i].width == mp->width && webcam_sizes[i].height == mp->height) break; dev->webcam_size_idx = i; if (dev->webcam_ival_idx >= ival_sz) dev->webcam_ival_idx = ival_sz - 1; vivid_update_format_cap(dev, false); } else { struct v4l2_rect r = { 0, 0, mp->width, mp->height }; v4l2_rect_set_size_to(compose, &r); r.height *= factor; v4l2_rect_set_size_to(crop, &r); } dev->fmt_cap_rect.width = mp->width; dev->fmt_cap_rect.height = mp->height; tpg_s_buf_height(&dev->tpg, mp->height); tpg_s_fourcc(&dev->tpg, dev->fmt_cap->fourcc); for (p = 0; p < tpg_g_buffers(&dev->tpg); p++) tpg_s_bytesperline(&dev->tpg, p, mp->plane_fmt[p].bytesperline); dev->field_cap = mp->field; if (dev->field_cap == V4L2_FIELD_ALTERNATE) tpg_s_field(&dev->tpg, V4L2_FIELD_TOP, true); else tpg_s_field(&dev->tpg, dev->field_cap, false); tpg_s_crop_compose(&dev->tpg, &dev->crop_cap, &dev->compose_cap); if (vivid_is_sdtv_cap(dev)) dev->tv_field_cap = mp->field; tpg_update_mv_step(&dev->tpg); dev->tpg.colorspace = mp->colorspace; dev->tpg.xfer_func = mp->xfer_func; if (dev->fmt_cap->color_enc == TGP_COLOR_ENC_YCBCR) dev->tpg.ycbcr_enc = mp->ycbcr_enc; else dev->tpg.hsv_enc = mp->hsv_enc; dev->tpg.quantization = mp->quantization; return 0; } int vidioc_g_fmt_vid_cap_mplane(struct file *file, void *priv, struct v4l2_format *f) { struct vivid_dev *dev = video_drvdata(file); if (!dev->multiplanar) return -ENOTTY; return vivid_g_fmt_vid_cap(file, priv, f); } int vidioc_try_fmt_vid_cap_mplane(struct file *file, void *priv, struct v4l2_format *f) { struct vivid_dev *dev = video_drvdata(file); if (!dev->multiplanar) return -ENOTTY; return vivid_try_fmt_vid_cap(file, priv, f); } int vidioc_s_fmt_vid_cap_mplane(struct file *file, void *priv, struct v4l2_format *f) { struct vivid_dev *dev = video_drvdata(file); if (!dev->multiplanar) return -ENOTTY; return vivid_s_fmt_vid_cap(file, priv, f); } int vidioc_g_fmt_vid_cap(struct file *file, void *priv, struct v4l2_format *f) { struct vivid_dev *dev = video_drvdata(file); if (dev->multiplanar) return -ENOTTY; return fmt_sp2mp_func(file, priv, f, vivid_g_fmt_vid_cap); } int vidioc_try_fmt_vid_cap(struct file *file, void *priv, struct v4l2_format *f) { struct vivid_dev *dev = video_drvdata(file); if (dev->multiplanar) return -ENOTTY; return fmt_sp2mp_func(file, priv, f, vivid_try_fmt_vid_cap); } int vidioc_s_fmt_vid_cap(struct file *file, void *priv, struct v4l2_format *f) { struct vivid_dev *dev = video_drvdata(file); if (dev->multiplanar) return -ENOTTY; return fmt_sp2mp_func(file, priv, f, vivid_s_fmt_vid_cap); } int vivid_vid_cap_g_selection(struct file *file, void *priv, struct v4l2_selection *sel) { struct vivid_dev *dev = video_drvdata(file); if (!dev->has_crop_cap && !dev->has_compose_cap) return -ENOTTY; if (sel->type != V4L2_BUF_TYPE_VIDEO_CAPTURE) return -EINVAL; if (vivid_is_webcam(dev)) return -ENODATA; sel->r.left = sel->r.top = 0; switch (sel->target) { case V4L2_SEL_TGT_CROP: if (!dev->has_crop_cap) return -EINVAL; sel->r = dev->crop_cap; break; case V4L2_SEL_TGT_CROP_DEFAULT: case V4L2_SEL_TGT_CROP_BOUNDS: if (!dev->has_crop_cap) return -EINVAL; sel->r = dev->src_rect; break; case V4L2_SEL_TGT_COMPOSE_BOUNDS: if (!dev->has_compose_cap) return -EINVAL; sel->r = vivid_max_rect; break; case V4L2_SEL_TGT_COMPOSE: if (!dev->has_compose_cap) return -EINVAL; sel->r = dev->compose_cap; break; case V4L2_SEL_TGT_COMPOSE_DEFAULT: if (!dev->has_compose_cap) return -EINVAL; sel->r = dev->fmt_cap_rect; break; default: return -EINVAL; } return 0; } int vivid_vid_cap_s_selection(struct file *file, void *fh, struct v4l2_selection *s) { struct vivid_dev *dev = video_drvdata(file); struct v4l2_rect *crop = &dev->crop_cap; struct v4l2_rect *compose = &dev->compose_cap; unsigned factor = V4L2_FIELD_HAS_T_OR_B(dev->field_cap) ? 2 : 1; int ret; if (!dev->has_crop_cap && !dev->has_compose_cap) return -ENOTTY; if (s->type != V4L2_BUF_TYPE_VIDEO_CAPTURE) return -EINVAL; if (vivid_is_webcam(dev)) return -ENODATA; switch (s->target) { case V4L2_SEL_TGT_CROP: if (!dev->has_crop_cap) return -EINVAL; ret = vivid_vid_adjust_sel(s->flags, &s->r); if (ret) return ret; v4l2_rect_set_min_size(&s->r, &vivid_min_rect); v4l2_rect_set_max_size(&s->r, &dev->src_rect); v4l2_rect_map_inside(&s->r, &dev->crop_bounds_cap); s->r.top /= factor; s->r.height /= factor; if (dev->has_scaler_cap) { struct v4l2_rect fmt = dev->fmt_cap_rect; struct v4l2_rect max_rect = { 0, 0, s->r.width * MAX_ZOOM, s->r.height * MAX_ZOOM }; struct v4l2_rect min_rect = { 0, 0, s->r.width / MAX_ZOOM, s->r.height / MAX_ZOOM }; v4l2_rect_set_min_size(&fmt, &min_rect); if (!dev->has_compose_cap) v4l2_rect_set_max_size(&fmt, &max_rect); if (!v4l2_rect_same_size(&dev->fmt_cap_rect, &fmt) && vb2_is_busy(&dev->vb_vid_cap_q)) return -EBUSY; if (dev->has_compose_cap) { v4l2_rect_set_min_size(compose, &min_rect); v4l2_rect_set_max_size(compose, &max_rect); v4l2_rect_map_inside(compose, &fmt); } dev->fmt_cap_rect = fmt; tpg_s_buf_height(&dev->tpg, fmt.height); } else if (dev->has_compose_cap) { struct v4l2_rect fmt = dev->fmt_cap_rect; v4l2_rect_set_min_size(&fmt, &s->r); if (!v4l2_rect_same_size(&dev->fmt_cap_rect, &fmt) && vb2_is_busy(&dev->vb_vid_cap_q)) return -EBUSY; dev->fmt_cap_rect = fmt; tpg_s_buf_height(&dev->tpg, fmt.height); v4l2_rect_set_size_to(compose, &s->r); v4l2_rect_map_inside(compose, &dev->fmt_cap_rect); } else { if (!v4l2_rect_same_size(&s->r, &dev->fmt_cap_rect) && vb2_is_busy(&dev->vb_vid_cap_q)) return -EBUSY; v4l2_rect_set_size_to(&dev->fmt_cap_rect, &s->r); v4l2_rect_set_size_to(compose, &s->r); v4l2_rect_map_inside(compose, &dev->fmt_cap_rect); tpg_s_buf_height(&dev->tpg, dev->fmt_cap_rect.height); } s->r.top *= factor; s->r.height *= factor; *crop = s->r; break; case V4L2_SEL_TGT_COMPOSE: if (!dev->has_compose_cap) return -EINVAL; ret = vivid_vid_adjust_sel(s->flags, &s->r); if (ret) return ret; v4l2_rect_set_min_size(&s->r, &vivid_min_rect); v4l2_rect_set_max_size(&s->r, &dev->fmt_cap_rect); if (dev->has_scaler_cap) { struct v4l2_rect max_rect = { 0, 0, dev->src_rect.width * MAX_ZOOM, (dev->src_rect.height / factor) * MAX_ZOOM }; v4l2_rect_set_max_size(&s->r, &max_rect); if (dev->has_crop_cap) { struct v4l2_rect min_rect = { 0, 0, s->r.width / MAX_ZOOM, (s->r.height * factor) / MAX_ZOOM }; struct v4l2_rect max_rect = { 0, 0, s->r.width * MAX_ZOOM, (s->r.height * factor) * MAX_ZOOM }; v4l2_rect_set_min_size(crop, &min_rect); v4l2_rect_set_max_size(crop, &max_rect); v4l2_rect_map_inside(crop, &dev->crop_bounds_cap); } } else if (dev->has_crop_cap) { s->r.top *= factor; s->r.height *= factor; v4l2_rect_set_max_size(&s->r, &dev->src_rect); v4l2_rect_set_size_to(crop, &s->r); v4l2_rect_map_inside(crop, &dev->crop_bounds_cap); s->r.top /= factor; s->r.height /= factor; } else { v4l2_rect_set_size_to(&s->r, &dev->src_rect); s->r.height /= factor; } v4l2_rect_map_inside(&s->r, &dev->fmt_cap_rect); *compose = s->r; break; default: return -EINVAL; } tpg_s_crop_compose(&dev->tpg, crop, compose); return 0; } int vivid_vid_cap_g_pixelaspect(struct file *file, void *priv, int type, struct v4l2_fract *f) { struct vivid_dev *dev = video_drvdata(file); if (type != V4L2_BUF_TYPE_VIDEO_CAPTURE) return -EINVAL; switch (vivid_get_pixel_aspect(dev)) { case TPG_PIXEL_ASPECT_NTSC: f->numerator = 11; f->denominator = 10; break; case TPG_PIXEL_ASPECT_PAL: f->numerator = 54; f->denominator = 59; break; default: break; } return 0; } static const struct v4l2_audio vivid_audio_inputs[] = { { 0, "TV", V4L2_AUDCAP_STEREO }, { 1, "Line-In", V4L2_AUDCAP_STEREO }, }; int vidioc_enum_input(struct file *file, void *priv, struct v4l2_input *inp) { struct vivid_dev *dev = video_drvdata(file); if (inp->index >= dev->num_inputs) return -EINVAL; inp->type = V4L2_INPUT_TYPE_CAMERA; switch (dev->input_type[inp->index]) { case WEBCAM: snprintf(inp->name, sizeof(inp->name), "Webcam %u", dev->input_name_counter[inp->index]); inp->capabilities = 0; break; case TV: snprintf(inp->name, sizeof(inp->name), "TV %u", dev->input_name_counter[inp->index]); inp->type = V4L2_INPUT_TYPE_TUNER; inp->std = V4L2_STD_ALL; if (dev->has_audio_inputs) inp->audioset = (1 << ARRAY_SIZE(vivid_audio_inputs)) - 1; inp->capabilities = V4L2_IN_CAP_STD; break; case SVID: snprintf(inp->name, sizeof(inp->name), "S-Video %u", dev->input_name_counter[inp->index]); inp->std = V4L2_STD_ALL; if (dev->has_audio_inputs) inp->audioset = (1 << ARRAY_SIZE(vivid_audio_inputs)) - 1; inp->capabilities = V4L2_IN_CAP_STD; break; case HDMI: snprintf(inp->name, sizeof(inp->name), "HDMI %u", dev->input_name_counter[inp->index]); inp->capabilities = V4L2_IN_CAP_DV_TIMINGS; if (dev->edid_blocks == 0 || dev->dv_timings_signal_mode[dev->input] == NO_SIGNAL) inp->status |= V4L2_IN_ST_NO_SIGNAL; else if (dev->dv_timings_signal_mode[dev->input] == NO_LOCK || dev->dv_timings_signal_mode[dev->input] == OUT_OF_RANGE) inp->status |= V4L2_IN_ST_NO_H_LOCK; break; } if (dev->sensor_hflip) inp->status |= V4L2_IN_ST_HFLIP; if (dev->sensor_vflip) inp->status |= V4L2_IN_ST_VFLIP; if (dev->input == inp->index && vivid_is_sdtv_cap(dev)) { if (dev->std_signal_mode[dev->input] == NO_SIGNAL) { inp->status |= V4L2_IN_ST_NO_SIGNAL; } else if (dev->std_signal_mode[dev->input] == NO_LOCK) { inp->status |= V4L2_IN_ST_NO_H_LOCK; } else if (vivid_is_tv_cap(dev)) { switch (tpg_g_quality(&dev->tpg)) { case TPG_QUAL_GRAY: inp->status |= V4L2_IN_ST_COLOR_KILL; break; case TPG_QUAL_NOISE: inp->status |= V4L2_IN_ST_NO_H_LOCK; break; default: break; } } } return 0; } int vidioc_g_input(struct file *file, void *priv, unsigned *i) { struct vivid_dev *dev = video_drvdata(file); *i = dev->input; return 0; } int vidioc_s_input(struct file *file, void *priv, unsigned i) { struct vivid_dev *dev = video_drvdata(file); struct v4l2_bt_timings *bt = &dev->dv_timings_cap[dev->input].bt; unsigned brightness; if (i >= dev->num_inputs) return -EINVAL; if (i == dev->input) return 0; if (vb2_is_busy(&dev->vb_vid_cap_q) || vb2_is_busy(&dev->vb_vbi_cap_q) || vb2_is_busy(&dev->vb_meta_cap_q)) return -EBUSY; dev->input = i; dev->vid_cap_dev.tvnorms = 0; if (dev->input_type[i] == TV || dev->input_type[i] == SVID) { dev->tv_audio_input = (dev->input_type[i] == TV) ? 0 : 1; dev->vid_cap_dev.tvnorms = V4L2_STD_ALL; } dev->vbi_cap_dev.tvnorms = dev->vid_cap_dev.tvnorms; dev->meta_cap_dev.tvnorms = dev->vid_cap_dev.tvnorms; vivid_update_format_cap(dev, false); if (dev->colorspace) { switch (dev->input_type[i]) { case WEBCAM: v4l2_ctrl_s_ctrl(dev->colorspace, VIVID_CS_SRGB); break; case TV: case SVID: v4l2_ctrl_s_ctrl(dev->colorspace, VIVID_CS_170M); break; case HDMI: if (bt->flags & V4L2_DV_FL_IS_CE_VIDEO) { if (dev->src_rect.width == 720 && dev->src_rect.height <= 576) v4l2_ctrl_s_ctrl(dev->colorspace, VIVID_CS_170M); else v4l2_ctrl_s_ctrl(dev->colorspace, VIVID_CS_709); } else { v4l2_ctrl_s_ctrl(dev->colorspace, VIVID_CS_SRGB); } break; } } /* * Modify the brightness range depending on the input. * This makes it easy to use vivid to test if applications can * handle control range modifications and is also how this is * typically used in practice as different inputs may be hooked * up to different receivers with different control ranges. */ brightness = 128 * i + dev->input_brightness[i]; v4l2_ctrl_modify_range(dev->brightness, 128 * i, 255 + 128 * i, 1, 128 + 128 * i); v4l2_ctrl_s_ctrl(dev->brightness, brightness); /* Restore per-input states. */ v4l2_ctrl_activate(dev->ctrl_dv_timings_signal_mode, vivid_is_hdmi_cap(dev)); v4l2_ctrl_activate(dev->ctrl_dv_timings, vivid_is_hdmi_cap(dev) && dev->dv_timings_signal_mode[dev->input] == SELECTED_DV_TIMINGS); v4l2_ctrl_activate(dev->ctrl_std_signal_mode, vivid_is_sdtv_cap(dev)); v4l2_ctrl_activate(dev->ctrl_standard, vivid_is_sdtv_cap(dev) && dev->std_signal_mode[dev->input]); if (vivid_is_hdmi_cap(dev)) { v4l2_ctrl_s_ctrl(dev->ctrl_dv_timings_signal_mode, dev->dv_timings_signal_mode[dev->input]); v4l2_ctrl_s_ctrl(dev->ctrl_dv_timings, dev->query_dv_timings[dev->input]); } else if (vivid_is_sdtv_cap(dev)) { v4l2_ctrl_s_ctrl(dev->ctrl_std_signal_mode, dev->std_signal_mode[dev->input]); v4l2_ctrl_s_ctrl(dev->ctrl_standard, dev->std_signal_mode[dev->input]); } return 0; } int vidioc_enumaudio(struct file *file, void *fh, struct v4l2_audio *vin) { if (vin->index >= ARRAY_SIZE(vivid_audio_inputs)) return -EINVAL; *vin = vivid_audio_inputs[vin->index]; return 0; } int vidioc_g_audio(struct file *file, void *fh, struct v4l2_audio *vin) { struct vivid_dev *dev = video_drvdata(file); if (!vivid_is_sdtv_cap(dev)) return -EINVAL; *vin = vivid_audio_inputs[dev->tv_audio_input]; return 0; } int vidioc_s_audio(struct file *file, void *fh, const struct v4l2_audio *vin) { struct vivid_dev *dev = video_drvdata(file); if (!vivid_is_sdtv_cap(dev)) return -EINVAL; if (vin->index >= ARRAY_SIZE(vivid_audio_inputs)) return -EINVAL; dev->tv_audio_input = vin->index; return 0; } int vivid_video_g_frequency(struct file *file, void *fh, struct v4l2_frequency *vf) { struct vivid_dev *dev = video_drvdata(file); if (vf->tuner != 0) return -EINVAL; vf->frequency = dev->tv_freq; return 0; } int vivid_video_s_frequency(struct file *file, void *fh, const struct v4l2_frequency *vf) { struct vivid_dev *dev = video_drvdata(file); if (vf->tuner != 0) return -EINVAL; dev->tv_freq = clamp_t(unsigned, vf->frequency, MIN_TV_FREQ, MAX_TV_FREQ); if (vivid_is_tv_cap(dev)) vivid_update_quality(dev); return 0; } int vivid_video_s_tuner(struct file *file, void *fh, const struct v4l2_tuner *vt) { struct vivid_dev *dev = video_drvdata(file); if (vt->index != 0) return -EINVAL; if (vt->audmode > V4L2_TUNER_MODE_LANG1_LANG2) return -EINVAL; dev->tv_audmode = vt->audmode; return 0; } int vivid_video_g_tuner(struct file *file, void *fh, struct v4l2_tuner *vt) { struct vivid_dev *dev = video_drvdata(file); enum tpg_quality qual; if (vt->index != 0) return -EINVAL; vt->capability = V4L2_TUNER_CAP_NORM | V4L2_TUNER_CAP_STEREO | V4L2_TUNER_CAP_LANG1 | V4L2_TUNER_CAP_LANG2; vt->audmode = dev->tv_audmode; vt->rangelow = MIN_TV_FREQ; vt->rangehigh = MAX_TV_FREQ; qual = vivid_get_quality(dev, &vt->afc); if (qual == TPG_QUAL_COLOR) vt->signal = 0xffff; else if (qual == TPG_QUAL_GRAY) vt->signal = 0x8000; else vt->signal = 0; if (qual == TPG_QUAL_NOISE) { vt->rxsubchans = 0; } else if (qual == TPG_QUAL_GRAY) { vt->rxsubchans = V4L2_TUNER_SUB_MONO; } else { unsigned int channel_nr = dev->tv_freq / (6 * 16); unsigned int options = (dev->std_cap[dev->input] & V4L2_STD_NTSC_M) ? 4 : 3; switch (channel_nr % options) { case 0: vt->rxsubchans = V4L2_TUNER_SUB_MONO; break; case 1: vt->rxsubchans = V4L2_TUNER_SUB_STEREO; break; case 2: if (dev->std_cap[dev->input] & V4L2_STD_NTSC_M) vt->rxsubchans = V4L2_TUNER_SUB_MONO | V4L2_TUNER_SUB_SAP; else vt->rxsubchans = V4L2_TUNER_SUB_LANG1 | V4L2_TUNER_SUB_LANG2; break; case 3: vt->rxsubchans = V4L2_TUNER_SUB_STEREO | V4L2_TUNER_SUB_SAP; break; } } strscpy(vt->name, "TV Tuner", sizeof(vt->name)); return 0; } /* Must remain in sync with the vivid_ctrl_standard_strings array */ const v4l2_std_id vivid_standard[] = { V4L2_STD_NTSC_M, V4L2_STD_NTSC_M_JP, V4L2_STD_NTSC_M_KR, V4L2_STD_NTSC_443, V4L2_STD_PAL_BG | V4L2_STD_PAL_H, V4L2_STD_PAL_I, V4L2_STD_PAL_DK, V4L2_STD_PAL_M, V4L2_STD_PAL_N, V4L2_STD_PAL_Nc, V4L2_STD_PAL_60, V4L2_STD_SECAM_B | V4L2_STD_SECAM_G | V4L2_STD_SECAM_H, V4L2_STD_SECAM_DK, V4L2_STD_SECAM_L, V4L2_STD_SECAM_LC, V4L2_STD_UNKNOWN }; /* Must remain in sync with the vivid_standard array */ const char * const vivid_ctrl_standard_strings[] = { "NTSC-M", "NTSC-M-JP", "NTSC-M-KR", "NTSC-443", "PAL-BGH", "PAL-I", "PAL-DK", "PAL-M", "PAL-N", "PAL-Nc", "PAL-60", "SECAM-BGH", "SECAM-DK", "SECAM-L", "SECAM-Lc", NULL, }; int vidioc_querystd(struct file *file, void *priv, v4l2_std_id *id) { struct vivid_dev *dev = video_drvdata(file); unsigned int last = dev->query_std_last[dev->input]; if (!vivid_is_sdtv_cap(dev)) return -ENODATA; if (dev->std_signal_mode[dev->input] == NO_SIGNAL || dev->std_signal_mode[dev->input] == NO_LOCK) { *id = V4L2_STD_UNKNOWN; return 0; } if (vivid_is_tv_cap(dev) && tpg_g_quality(&dev->tpg) == TPG_QUAL_NOISE) { *id = V4L2_STD_UNKNOWN; } else if (dev->std_signal_mode[dev->input] == CURRENT_STD) { *id = dev->std_cap[dev->input]; } else if (dev->std_signal_mode[dev->input] == SELECTED_STD) { *id = dev->query_std[dev->input]; } else { *id = vivid_standard[last]; dev->query_std_last[dev->input] = (last + 1) % ARRAY_SIZE(vivid_standard); } return 0; } int vivid_vid_cap_s_std(struct file *file, void *priv, v4l2_std_id id) { struct vivid_dev *dev = video_drvdata(file); if (!vivid_is_sdtv_cap(dev)) return -ENODATA; if (dev->std_cap[dev->input] == id) return 0; if (vb2_is_busy(&dev->vb_vid_cap_q) || vb2_is_busy(&dev->vb_vbi_cap_q)) return -EBUSY; dev->std_cap[dev->input] = id; vivid_update_format_cap(dev, false); return 0; } static void find_aspect_ratio(u32 width, u32 height, u32 *num, u32 *denom) { if (!(height % 3) && ((height * 4 / 3) == width)) { *num = 4; *denom = 3; } else if (!(height % 9) && ((height * 16 / 9) == width)) { *num = 16; *denom = 9; } else if (!(height % 10) && ((height * 16 / 10) == width)) { *num = 16; *denom = 10; } else if (!(height % 4) && ((height * 5 / 4) == width)) { *num = 5; *denom = 4; } else if (!(height % 9) && ((height * 15 / 9) == width)) { *num = 15; *denom = 9; } else { /* default to 16:9 */ *num = 16; *denom = 9; } } static bool valid_cvt_gtf_timings(struct v4l2_dv_timings *timings) { struct v4l2_bt_timings *bt = &timings->bt; u32 total_h_pixel; u32 total_v_lines; u32 h_freq; if (!v4l2_valid_dv_timings(timings, &vivid_dv_timings_cap, NULL, NULL)) return false; total_h_pixel = V4L2_DV_BT_FRAME_WIDTH(bt); total_v_lines = V4L2_DV_BT_FRAME_HEIGHT(bt); h_freq = (u32)bt->pixelclock / total_h_pixel; if (bt->standards == 0 || (bt->standards & V4L2_DV_BT_STD_CVT)) { if (v4l2_detect_cvt(total_v_lines, h_freq, bt->vsync, bt->width, bt->polarities, bt->interlaced, timings)) return true; } if (bt->standards == 0 || (bt->standards & V4L2_DV_BT_STD_GTF)) { struct v4l2_fract aspect_ratio; find_aspect_ratio(bt->width, bt->height, &aspect_ratio.numerator, &aspect_ratio.denominator); if (v4l2_detect_gtf(total_v_lines, h_freq, bt->vsync, bt->polarities, bt->interlaced, aspect_ratio, timings)) return true; } return false; } int vivid_vid_cap_s_dv_timings(struct file *file, void *_fh, struct v4l2_dv_timings *timings) { struct vivid_dev *dev = video_drvdata(file); if (!vivid_is_hdmi_cap(dev)) return -ENODATA; if (!v4l2_find_dv_timings_cap(timings, &vivid_dv_timings_cap, 0, NULL, NULL) && !valid_cvt_gtf_timings(timings)) return -EINVAL; if (v4l2_match_dv_timings(timings, &dev->dv_timings_cap[dev->input], 0, false)) return 0; if (vb2_is_busy(&dev->vb_vid_cap_q)) return -EBUSY; dev->dv_timings_cap[dev->input] = *timings; vivid_update_format_cap(dev, false); return 0; } int vidioc_query_dv_timings(struct file *file, void *_fh, struct v4l2_dv_timings *timings) { struct vivid_dev *dev = video_drvdata(file); unsigned int input = dev->input; unsigned int last = dev->query_dv_timings_last[input]; if (!vivid_is_hdmi_cap(dev)) return -ENODATA; if (dev->dv_timings_signal_mode[input] == NO_SIGNAL || dev->edid_blocks == 0) return -ENOLINK; if (dev->dv_timings_signal_mode[input] == NO_LOCK) return -ENOLCK; if (dev->dv_timings_signal_mode[input] == OUT_OF_RANGE) { timings->bt.pixelclock = vivid_dv_timings_cap.bt.max_pixelclock * 2; return -ERANGE; } if (dev->dv_timings_signal_mode[input] == CURRENT_DV_TIMINGS) { *timings = dev->dv_timings_cap[input]; } else if (dev->dv_timings_signal_mode[input] == SELECTED_DV_TIMINGS) { *timings = v4l2_dv_timings_presets[dev->query_dv_timings[input]]; } else { *timings = v4l2_dv_timings_presets[last]; dev->query_dv_timings_last[input] = (last + 1) % dev->query_dv_timings_size; } return 0; } int vidioc_s_edid(struct file *file, void *_fh, struct v4l2_edid *edid) { struct vivid_dev *dev = video_drvdata(file); u16 phys_addr; u32 display_present = 0; unsigned int i, j; int ret; memset(edid->reserved, 0, sizeof(edid->reserved)); if (edid->pad >= dev->num_inputs) return -EINVAL; if (dev->input_type[edid->pad] != HDMI || edid->start_block) return -EINVAL; if (edid->blocks == 0) { dev->edid_blocks = 0; v4l2_ctrl_s_ctrl(dev->ctrl_tx_edid_present, 0); v4l2_ctrl_s_ctrl(dev->ctrl_tx_hotplug, 0); phys_addr = CEC_PHYS_ADDR_INVALID; goto set_phys_addr; } if (edid->blocks > dev->edid_max_blocks) { edid->blocks = dev->edid_max_blocks; return -E2BIG; } phys_addr = cec_get_edid_phys_addr(edid->edid, edid->blocks * 128, NULL); ret = v4l2_phys_addr_validate(phys_addr, &phys_addr, NULL); if (ret) return ret; if (vb2_is_busy(&dev->vb_vid_cap_q)) return -EBUSY; dev->edid_blocks = edid->blocks; memcpy(dev->edid, edid->edid, edid->blocks * 128); for (i = 0, j = 0; i < dev->num_outputs; i++) if (dev->output_type[i] == HDMI) display_present |= dev->display_present[i] << j++; v4l2_ctrl_s_ctrl(dev->ctrl_tx_edid_present, display_present); v4l2_ctrl_s_ctrl(dev->ctrl_tx_hotplug, display_present); set_phys_addr: /* TODO: a proper hotplug detect cycle should be emulated here */ cec_s_phys_addr(dev->cec_rx_adap, phys_addr, false); for (i = 0; i < MAX_OUTPUTS && dev->cec_tx_adap[i]; i++) cec_s_phys_addr(dev->cec_tx_adap[i], dev->display_present[i] ? v4l2_phys_addr_for_input(phys_addr, i + 1) : CEC_PHYS_ADDR_INVALID, false); return 0; } int vidioc_enum_framesizes(struct file *file, void *fh, struct v4l2_frmsizeenum *fsize) { struct vivid_dev *dev = video_drvdata(file); if (!vivid_is_webcam(dev) && !dev->has_scaler_cap) return -EINVAL; if (vivid_get_format(dev, fsize->pixel_format) == NULL) return -EINVAL; if (vivid_is_webcam(dev)) { if (fsize->index >= ARRAY_SIZE(webcam_sizes)) return -EINVAL; fsize->type = V4L2_FRMSIZE_TYPE_DISCRETE; fsize->discrete = webcam_sizes[fsize->index]; return 0; } if (fsize->index) return -EINVAL; fsize->type = V4L2_FRMSIZE_TYPE_STEPWISE; fsize->stepwise.min_width = MIN_WIDTH; fsize->stepwise.max_width = MAX_WIDTH * MAX_ZOOM; fsize->stepwise.step_width = 2; fsize->stepwise.min_height = MIN_HEIGHT; fsize->stepwise.max_height = MAX_HEIGHT * MAX_ZOOM; fsize->stepwise.step_height = 2; return 0; } /* timeperframe is arbitrary and continuous */ int vidioc_enum_frameintervals(struct file *file, void *priv, struct v4l2_frmivalenum *fival) { struct vivid_dev *dev = video_drvdata(file); const struct vivid_fmt *fmt; int i; fmt = vivid_get_format(dev, fival->pixel_format); if (!fmt) return -EINVAL; if (!vivid_is_webcam(dev)) { if (fival->index) return -EINVAL; if (fival->width < MIN_WIDTH || fival->width > MAX_WIDTH * MAX_ZOOM) return -EINVAL; if (fival->height < MIN_HEIGHT || fival->height > MAX_HEIGHT * MAX_ZOOM) return -EINVAL; fival->type = V4L2_FRMIVAL_TYPE_DISCRETE; fival->discrete = dev->timeperframe_vid_cap; return 0; } for (i = 0; i < ARRAY_SIZE(webcam_sizes); i++) if (fival->width == webcam_sizes[i].width && fival->height == webcam_sizes[i].height) break; if (i == ARRAY_SIZE(webcam_sizes)) return -EINVAL; if (fival->index >= webcam_ival_count(dev, i)) return -EINVAL; fival->type = V4L2_FRMIVAL_TYPE_DISCRETE; fival->discrete = webcam_intervals[fival->index]; return 0; } int vivid_vid_cap_g_parm(struct file *file, void *priv, struct v4l2_streamparm *parm) { struct vivid_dev *dev = video_drvdata(file); if (parm->type != (dev->multiplanar ? V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE : V4L2_BUF_TYPE_VIDEO_CAPTURE)) return -EINVAL; parm->parm.capture.capability = V4L2_CAP_TIMEPERFRAME; parm->parm.capture.timeperframe = dev->timeperframe_vid_cap; parm->parm.capture.readbuffers = 1; return 0; } int vivid_vid_cap_s_parm(struct file *file, void *priv, struct v4l2_streamparm *parm) { struct vivid_dev *dev = video_drvdata(file); unsigned int ival_sz = webcam_ival_count(dev, dev->webcam_size_idx); struct v4l2_fract tpf; unsigned i; if (parm->type != (dev->multiplanar ? V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE : V4L2_BUF_TYPE_VIDEO_CAPTURE)) return -EINVAL; if (!vivid_is_webcam(dev)) return vivid_vid_cap_g_parm(file, priv, parm); tpf = parm->parm.capture.timeperframe; if (tpf.denominator == 0) tpf = webcam_intervals[ival_sz - 1]; for (i = 0; i < ival_sz; i++) if (V4L2_FRACT_COMPARE(tpf, >=, webcam_intervals[i])) break; if (i == ival_sz) i = ival_sz - 1; dev->webcam_ival_idx = i; tpf = webcam_intervals[dev->webcam_ival_idx]; /* resync the thread's timings */ dev->cap_seq_resync = true; dev->timeperframe_vid_cap = tpf; parm->parm.capture.capability = V4L2_CAP_TIMEPERFRAME; parm->parm.capture.timeperframe = tpf; parm->parm.capture.readbuffers = 1; return 0; }
433 14 15 412 440 172 351 18 21 22 415 113 8 105 5 104 114 1 94 5 3 2 3 2 2 2 407 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 // SPDX-License-Identifier: GPL-2.0-only /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> * (C) 2006-2012 Patrick McHardy <kaber@trash.net> */ #include <linux/types.h> #include <linux/timer.h> #include <linux/module.h> #include <linux/udp.h> #include <linux/seq_file.h> #include <linux/skbuff.h> #include <linux/ipv6.h> #include <net/ip6_checksum.h> #include <net/checksum.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv6.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_timeout.h> #include <net/netfilter/nf_log.h> #include <net/netfilter/ipv4/nf_conntrack_ipv4.h> #include <net/netfilter/ipv6/nf_conntrack_ipv6.h> static const unsigned int udp_timeouts[UDP_CT_MAX] = { [UDP_CT_UNREPLIED] = 30*HZ, [UDP_CT_REPLIED] = 120*HZ, }; static unsigned int *udp_get_timeouts(struct net *net) { return nf_udp_pernet(net)->timeouts; } static void udp_error_log(const struct sk_buff *skb, const struct nf_hook_state *state, const char *msg) { nf_l4proto_log_invalid(skb, state, IPPROTO_UDP, "%s", msg); } static bool udp_error(struct sk_buff *skb, unsigned int dataoff, const struct nf_hook_state *state) { unsigned int udplen = skb->len - dataoff; const struct udphdr *hdr; struct udphdr _hdr; /* Header is too small? */ hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); if (!hdr) { udp_error_log(skb, state, "short packet"); return true; } /* Truncated/malformed packets */ if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) { udp_error_log(skb, state, "truncated/malformed packet"); return true; } /* Packet with no checksum */ if (!hdr->check) return false; /* Checksum invalid? Ignore. * We skip checking packets on the outgoing path * because the checksum is assumed to be correct. * FIXME: Source route IP option packets --RR */ if (state->hook == NF_INET_PRE_ROUTING && state->net->ct.sysctl_checksum && nf_checksum(skb, state->hook, dataoff, IPPROTO_UDP, state->pf)) { udp_error_log(skb, state, "bad checksum"); return true; } return false; } /* Returns verdict for packet, and may modify conntracktype */ int nf_conntrack_udp_packet(struct nf_conn *ct, struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state) { unsigned int *timeouts; unsigned long status; if (udp_error(skb, dataoff, state)) return -NF_ACCEPT; timeouts = nf_ct_timeout_lookup(ct); if (!timeouts) timeouts = udp_get_timeouts(nf_ct_net(ct)); status = READ_ONCE(ct->status); if ((status & IPS_CONFIRMED) == 0) ct->proto.udp.stream_ts = 2 * HZ + jiffies; /* If we've seen traffic both ways, this is some kind of UDP * stream. Set Assured. */ if (status & IPS_SEEN_REPLY) { unsigned long extra = timeouts[UDP_CT_UNREPLIED]; bool stream = false; /* Still active after two seconds? Extend timeout. */ if (time_after(jiffies, ct->proto.udp.stream_ts)) { extra = timeouts[UDP_CT_REPLIED]; stream = (status & IPS_ASSURED) == 0; } nf_ct_refresh_acct(ct, ctinfo, skb, extra); /* never set ASSURED for IPS_NAT_CLASH, they time out soon */ if (unlikely((status & IPS_NAT_CLASH))) return NF_ACCEPT; /* Also, more likely to be important, and not a probe */ if (stream && !test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) nf_conntrack_event_cache(IPCT_ASSURED, ct); } else { nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[UDP_CT_UNREPLIED]); } return NF_ACCEPT; } #ifdef CONFIG_NF_CT_PROTO_UDPLITE static void udplite_error_log(const struct sk_buff *skb, const struct nf_hook_state *state, const char *msg) { nf_l4proto_log_invalid(skb, state, IPPROTO_UDPLITE, "%s", msg); } static bool udplite_error(struct sk_buff *skb, unsigned int dataoff, const struct nf_hook_state *state) { unsigned int udplen = skb->len - dataoff; const struct udphdr *hdr; struct udphdr _hdr; unsigned int cscov; /* Header is too small? */ hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); if (!hdr) { udplite_error_log(skb, state, "short packet"); return true; } cscov = ntohs(hdr->len); if (cscov == 0) { cscov = udplen; } else if (cscov < sizeof(*hdr) || cscov > udplen) { udplite_error_log(skb, state, "invalid checksum coverage"); return true; } /* UDPLITE mandates checksums */ if (!hdr->check) { udplite_error_log(skb, state, "checksum missing"); return true; } /* Checksum invalid? Ignore. */ if (state->hook == NF_INET_PRE_ROUTING && state->net->ct.sysctl_checksum && nf_checksum_partial(skb, state->hook, dataoff, cscov, IPPROTO_UDP, state->pf)) { udplite_error_log(skb, state, "bad checksum"); return true; } return false; } /* Returns verdict for packet, and may modify conntracktype */ int nf_conntrack_udplite_packet(struct nf_conn *ct, struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state) { unsigned int *timeouts; if (udplite_error(skb, dataoff, state)) return -NF_ACCEPT; timeouts = nf_ct_timeout_lookup(ct); if (!timeouts) timeouts = udp_get_timeouts(nf_ct_net(ct)); /* If we've seen traffic both ways, this is some kind of UDP stream. Extend timeout. */ if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[UDP_CT_REPLIED]); if (unlikely((ct->status & IPS_NAT_CLASH))) return NF_ACCEPT; /* Also, more likely to be important, and not a probe */ if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) nf_conntrack_event_cache(IPCT_ASSURED, ct); } else { nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[UDP_CT_UNREPLIED]); } return NF_ACCEPT; } #endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_cttimeout.h> static int udp_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { unsigned int *timeouts = data; struct nf_udp_net *un = nf_udp_pernet(net); if (!timeouts) timeouts = un->timeouts; /* set default timeouts for UDP. */ timeouts[UDP_CT_UNREPLIED] = un->timeouts[UDP_CT_UNREPLIED]; timeouts[UDP_CT_REPLIED] = un->timeouts[UDP_CT_REPLIED]; if (tb[CTA_TIMEOUT_UDP_UNREPLIED]) { timeouts[UDP_CT_UNREPLIED] = ntohl(nla_get_be32(tb[CTA_TIMEOUT_UDP_UNREPLIED])) * HZ; } if (tb[CTA_TIMEOUT_UDP_REPLIED]) { timeouts[UDP_CT_REPLIED] = ntohl(nla_get_be32(tb[CTA_TIMEOUT_UDP_REPLIED])) * HZ; } return 0; } static int udp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) { const unsigned int *timeouts = data; if (nla_put_be32(skb, CTA_TIMEOUT_UDP_UNREPLIED, htonl(timeouts[UDP_CT_UNREPLIED] / HZ)) || nla_put_be32(skb, CTA_TIMEOUT_UDP_REPLIED, htonl(timeouts[UDP_CT_REPLIED] / HZ))) goto nla_put_failure; return 0; nla_put_failure: return -ENOSPC; } static const struct nla_policy udp_timeout_nla_policy[CTA_TIMEOUT_UDP_MAX+1] = { [CTA_TIMEOUT_UDP_UNREPLIED] = { .type = NLA_U32 }, [CTA_TIMEOUT_UDP_REPLIED] = { .type = NLA_U32 }, }; #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ void nf_conntrack_udp_init_net(struct net *net) { struct nf_udp_net *un = nf_udp_pernet(net); int i; for (i = 0; i < UDP_CT_MAX; i++) un->timeouts[i] = udp_timeouts[i]; #if IS_ENABLED(CONFIG_NF_FLOW_TABLE) un->offload_timeout = 30 * HZ; #endif } const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp = { .l4proto = IPPROTO_UDP, .allow_clash = true, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, .nla_policy = nf_ct_port_nla_policy, #endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT .ctnl_timeout = { .nlattr_to_obj = udp_timeout_nlattr_to_obj, .obj_to_nlattr = udp_timeout_obj_to_nlattr, .nlattr_max = CTA_TIMEOUT_UDP_MAX, .obj_size = sizeof(unsigned int) * CTA_TIMEOUT_UDP_MAX, .nla_policy = udp_timeout_nla_policy, }, #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ }; #ifdef CONFIG_NF_CT_PROTO_UDPLITE const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite = { .l4proto = IPPROTO_UDPLITE, .allow_clash = true, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, .nla_policy = nf_ct_port_nla_policy, #endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT .ctnl_timeout = { .nlattr_to_obj = udp_timeout_nlattr_to_obj, .obj_to_nlattr = udp_timeout_obj_to_nlattr, .nlattr_max = CTA_TIMEOUT_UDP_MAX, .obj_size = sizeof(unsigned int) * CTA_TIMEOUT_UDP_MAX, .nla_policy = udp_timeout_nla_policy, }, #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ }; #endif
2 2 4 7 7 1 6 2 4 2 4 6 5 5 1 2 2 1672 1672 1 19 7 19 3 1 1 1 1 11 3 1 4 5 2 7 4 15 3 9 5 4 10 5 7 7 3 3 3 2 4 6 5 3 2 4 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 // SPDX-License-Identifier: GPL-2.0-or-later /* * NetLabel Unlabeled Support * * This file defines functions for dealing with unlabeled packets for the * NetLabel system. The NetLabel system manages static and dynamic label * mappings for network protocols such as CIPSO and RIPSO. * * Author: Paul Moore <paul@paul-moore.com> */ /* * (c) Copyright Hewlett-Packard Development Company, L.P., 2006 - 2008 */ #include <linux/types.h> #include <linux/rcupdate.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/socket.h> #include <linux/string.h> #include <linux/skbuff.h> #include <linux/audit.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/notifier.h> #include <linux/netdevice.h> #include <linux/security.h> #include <linux/slab.h> #include <net/sock.h> #include <net/netlink.h> #include <net/genetlink.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/net_namespace.h> #include <net/netlabel.h> #include <asm/bug.h> #include <linux/atomic.h> #include "netlabel_user.h" #include "netlabel_addrlist.h" #include "netlabel_domainhash.h" #include "netlabel_unlabeled.h" #include "netlabel_mgmt.h" /* NOTE: at present we always use init's network namespace since we don't * presently support different namespaces even though the majority of * the functions in this file are "namespace safe" */ /* The unlabeled connection hash table which we use to map network interfaces * and addresses of unlabeled packets to a user specified secid value for the * LSM. The hash table is used to lookup the network interface entry * (struct netlbl_unlhsh_iface) and then the interface entry is used to * lookup an IP address match from an ordered list. If a network interface * match can not be found in the hash table then the default entry * (netlbl_unlhsh_def) is used. The IP address entry list * (struct netlbl_unlhsh_addr) is ordered such that the entries with a * larger netmask come first. */ struct netlbl_unlhsh_tbl { struct list_head *tbl; u32 size; }; #define netlbl_unlhsh_addr4_entry(iter) \ container_of(iter, struct netlbl_unlhsh_addr4, list) struct netlbl_unlhsh_addr4 { u32 secid; struct netlbl_af4list list; struct rcu_head rcu; }; #define netlbl_unlhsh_addr6_entry(iter) \ container_of(iter, struct netlbl_unlhsh_addr6, list) struct netlbl_unlhsh_addr6 { u32 secid; struct netlbl_af6list list; struct rcu_head rcu; }; struct netlbl_unlhsh_iface { int ifindex; struct list_head addr4_list; struct list_head addr6_list; u32 valid; struct list_head list; struct rcu_head rcu; }; /* Argument struct for netlbl_unlhsh_walk() */ struct netlbl_unlhsh_walk_arg { struct netlink_callback *nl_cb; struct sk_buff *skb; u32 seq; }; /* Unlabeled connection hash table */ /* updates should be so rare that having one spinlock for the entire * hash table should be okay */ static DEFINE_SPINLOCK(netlbl_unlhsh_lock); #define netlbl_unlhsh_rcu_deref(p) \ rcu_dereference_check(p, lockdep_is_held(&netlbl_unlhsh_lock)) static struct netlbl_unlhsh_tbl __rcu *netlbl_unlhsh; static struct netlbl_unlhsh_iface __rcu *netlbl_unlhsh_def; /* Accept unlabeled packets flag */ static u8 netlabel_unlabel_acceptflg; /* NetLabel Generic NETLINK unlabeled family */ static struct genl_family netlbl_unlabel_gnl_family; /* NetLabel Netlink attribute policy */ static const struct nla_policy netlbl_unlabel_genl_policy[NLBL_UNLABEL_A_MAX + 1] = { [NLBL_UNLABEL_A_ACPTFLG] = { .type = NLA_U8 }, [NLBL_UNLABEL_A_IPV6ADDR] = { .type = NLA_BINARY, .len = sizeof(struct in6_addr) }, [NLBL_UNLABEL_A_IPV6MASK] = { .type = NLA_BINARY, .len = sizeof(struct in6_addr) }, [NLBL_UNLABEL_A_IPV4ADDR] = { .type = NLA_BINARY, .len = sizeof(struct in_addr) }, [NLBL_UNLABEL_A_IPV4MASK] = { .type = NLA_BINARY, .len = sizeof(struct in_addr) }, [NLBL_UNLABEL_A_IFACE] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 }, [NLBL_UNLABEL_A_SECCTX] = { .type = NLA_BINARY } }; /* * Unlabeled Connection Hash Table Functions */ /** * netlbl_unlhsh_free_iface - Frees an interface entry from the hash table * @entry: the entry's RCU field * * Description: * This function is designed to be used as a callback to the call_rcu() * function so that memory allocated to a hash table interface entry can be * released safely. It is important to note that this function does not free * the IPv4 and IPv6 address lists contained as part of an interface entry. It * is up to the rest of the code to make sure an interface entry is only freed * once it's address lists are empty. * */ static void netlbl_unlhsh_free_iface(struct rcu_head *entry) { struct netlbl_unlhsh_iface *iface; struct netlbl_af4list *iter4; struct netlbl_af4list *tmp4; #if IS_ENABLED(CONFIG_IPV6) struct netlbl_af6list *iter6; struct netlbl_af6list *tmp6; #endif /* IPv6 */ iface = container_of(entry, struct netlbl_unlhsh_iface, rcu); /* no need for locks here since we are the only one with access to this * structure */ netlbl_af4list_foreach_safe(iter4, tmp4, &iface->addr4_list) { netlbl_af4list_remove_entry(iter4); kfree(netlbl_unlhsh_addr4_entry(iter4)); } #if IS_ENABLED(CONFIG_IPV6) netlbl_af6list_foreach_safe(iter6, tmp6, &iface->addr6_list) { netlbl_af6list_remove_entry(iter6); kfree(netlbl_unlhsh_addr6_entry(iter6)); } #endif /* IPv6 */ kfree(iface); } /** * netlbl_unlhsh_hash - Hashing function for the hash table * @ifindex: the network interface/device to hash * * Description: * This is the hashing function for the unlabeled hash table, it returns the * bucket number for the given device/interface. The caller is responsible for * ensuring that the hash table is protected with either a RCU read lock or * the hash table lock. * */ static u32 netlbl_unlhsh_hash(int ifindex) { return ifindex & (netlbl_unlhsh_rcu_deref(netlbl_unlhsh)->size - 1); } /** * netlbl_unlhsh_search_iface - Search for a matching interface entry * @ifindex: the network interface * * Description: * Searches the unlabeled connection hash table and returns a pointer to the * interface entry which matches @ifindex, otherwise NULL is returned. The * caller is responsible for ensuring that the hash table is protected with * either a RCU read lock or the hash table lock. * */ static struct netlbl_unlhsh_iface *netlbl_unlhsh_search_iface(int ifindex) { u32 bkt; struct list_head *bkt_list; struct netlbl_unlhsh_iface *iter; bkt = netlbl_unlhsh_hash(ifindex); bkt_list = &netlbl_unlhsh_rcu_deref(netlbl_unlhsh)->tbl[bkt]; list_for_each_entry_rcu(iter, bkt_list, list, lockdep_is_held(&netlbl_unlhsh_lock)) if (iter->valid && iter->ifindex == ifindex) return iter; return NULL; } /** * netlbl_unlhsh_add_addr4 - Add a new IPv4 address entry to the hash table * @iface: the associated interface entry * @addr: IPv4 address in network byte order * @mask: IPv4 address mask in network byte order * @secid: LSM secid value for entry * * Description: * Add a new address entry into the unlabeled connection hash table using the * interface entry specified by @iface. On success zero is returned, otherwise * a negative value is returned. * */ static int netlbl_unlhsh_add_addr4(struct netlbl_unlhsh_iface *iface, const struct in_addr *addr, const struct in_addr *mask, u32 secid) { int ret_val; struct netlbl_unlhsh_addr4 *entry; entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (entry == NULL) return -ENOMEM; entry->list.addr = addr->s_addr & mask->s_addr; entry->list.mask = mask->s_addr; entry->list.valid = 1; entry->secid = secid; spin_lock(&netlbl_unlhsh_lock); ret_val = netlbl_af4list_add(&entry->list, &iface->addr4_list); spin_unlock(&netlbl_unlhsh_lock); if (ret_val != 0) kfree(entry); return ret_val; } #if IS_ENABLED(CONFIG_IPV6) /** * netlbl_unlhsh_add_addr6 - Add a new IPv6 address entry to the hash table * @iface: the associated interface entry * @addr: IPv6 address in network byte order * @mask: IPv6 address mask in network byte order * @secid: LSM secid value for entry * * Description: * Add a new address entry into the unlabeled connection hash table using the * interface entry specified by @iface. On success zero is returned, otherwise * a negative value is returned. * */ static int netlbl_unlhsh_add_addr6(struct netlbl_unlhsh_iface *iface, const struct in6_addr *addr, const struct in6_addr *mask, u32 secid) { int ret_val; struct netlbl_unlhsh_addr6 *entry; entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (entry == NULL) return -ENOMEM; entry->list.addr = *addr; entry->list.addr.s6_addr32[0] &= mask->s6_addr32[0]; entry->list.addr.s6_addr32[1] &= mask->s6_addr32[1]; entry->list.addr.s6_addr32[2] &= mask->s6_addr32[2]; entry->list.addr.s6_addr32[3] &= mask->s6_addr32[3]; entry->list.mask = *mask; entry->list.valid = 1; entry->secid = secid; spin_lock(&netlbl_unlhsh_lock); ret_val = netlbl_af6list_add(&entry->list, &iface->addr6_list); spin_unlock(&netlbl_unlhsh_lock); if (ret_val != 0) kfree(entry); return 0; } #endif /* IPv6 */ /** * netlbl_unlhsh_add_iface - Adds a new interface entry to the hash table * @ifindex: network interface * * Description: * Add a new, empty, interface entry into the unlabeled connection hash table. * On success a pointer to the new interface entry is returned, on failure NULL * is returned. * */ static struct netlbl_unlhsh_iface *netlbl_unlhsh_add_iface(int ifindex) { u32 bkt; struct netlbl_unlhsh_iface *iface; iface = kzalloc(sizeof(*iface), GFP_ATOMIC); if (iface == NULL) return NULL; iface->ifindex = ifindex; INIT_LIST_HEAD(&iface->addr4_list); INIT_LIST_HEAD(&iface->addr6_list); iface->valid = 1; spin_lock(&netlbl_unlhsh_lock); if (ifindex > 0) { bkt = netlbl_unlhsh_hash(ifindex); if (netlbl_unlhsh_search_iface(ifindex) != NULL) goto add_iface_failure; list_add_tail_rcu(&iface->list, &netlbl_unlhsh_rcu_deref(netlbl_unlhsh)->tbl[bkt]); } else { INIT_LIST_HEAD(&iface->list); if (netlbl_unlhsh_rcu_deref(netlbl_unlhsh_def) != NULL) goto add_iface_failure; rcu_assign_pointer(netlbl_unlhsh_def, iface); } spin_unlock(&netlbl_unlhsh_lock); return iface; add_iface_failure: spin_unlock(&netlbl_unlhsh_lock); kfree(iface); return NULL; } /** * netlbl_unlhsh_add - Adds a new entry to the unlabeled connection hash table * @net: network namespace * @dev_name: interface name * @addr: IP address in network byte order * @mask: address mask in network byte order * @addr_len: length of address/mask (4 for IPv4, 16 for IPv6) * @secid: LSM secid value for the entry * @audit_info: NetLabel audit information * * Description: * Adds a new entry to the unlabeled connection hash table. Returns zero on * success, negative values on failure. * */ int netlbl_unlhsh_add(struct net *net, const char *dev_name, const void *addr, const void *mask, u32 addr_len, u32 secid, struct netlbl_audit *audit_info) { int ret_val; int ifindex; struct net_device *dev; struct netlbl_unlhsh_iface *iface; struct audit_buffer *audit_buf = NULL; char *secctx = NULL; u32 secctx_len; if (addr_len != sizeof(struct in_addr) && addr_len != sizeof(struct in6_addr)) return -EINVAL; rcu_read_lock(); if (dev_name != NULL) { dev = dev_get_by_name_rcu(net, dev_name); if (dev == NULL) { ret_val = -ENODEV; goto unlhsh_add_return; } ifindex = dev->ifindex; iface = netlbl_unlhsh_search_iface(ifindex); } else { ifindex = 0; iface = rcu_dereference(netlbl_unlhsh_def); } if (iface == NULL) { iface = netlbl_unlhsh_add_iface(ifindex); if (iface == NULL) { ret_val = -ENOMEM; goto unlhsh_add_return; } } audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCADD, audit_info); switch (addr_len) { case sizeof(struct in_addr): { const struct in_addr *addr4 = addr; const struct in_addr *mask4 = mask; ret_val = netlbl_unlhsh_add_addr4(iface, addr4, mask4, secid); if (audit_buf != NULL) netlbl_af4list_audit_addr(audit_buf, 1, dev_name, addr4->s_addr, mask4->s_addr); break; } #if IS_ENABLED(CONFIG_IPV6) case sizeof(struct in6_addr): { const struct in6_addr *addr6 = addr; const struct in6_addr *mask6 = mask; ret_val = netlbl_unlhsh_add_addr6(iface, addr6, mask6, secid); if (audit_buf != NULL) netlbl_af6list_audit_addr(audit_buf, 1, dev_name, addr6, mask6); break; } #endif /* IPv6 */ default: ret_val = -EINVAL; } if (ret_val == 0) atomic_inc(&netlabel_mgmt_protocount); unlhsh_add_return: rcu_read_unlock(); if (audit_buf != NULL) { if (security_secid_to_secctx(secid, &secctx, &secctx_len) == 0) { audit_log_format(audit_buf, " sec_obj=%s", secctx); security_release_secctx(secctx, secctx_len); } audit_log_format(audit_buf, " res=%u", ret_val == 0 ? 1 : 0); audit_log_end(audit_buf); } return ret_val; } /** * netlbl_unlhsh_remove_addr4 - Remove an IPv4 address entry * @net: network namespace * @iface: interface entry * @addr: IP address * @mask: IP address mask * @audit_info: NetLabel audit information * * Description: * Remove an IP address entry from the unlabeled connection hash table. * Returns zero on success, negative values on failure. * */ static int netlbl_unlhsh_remove_addr4(struct net *net, struct netlbl_unlhsh_iface *iface, const struct in_addr *addr, const struct in_addr *mask, struct netlbl_audit *audit_info) { struct netlbl_af4list *list_entry; struct netlbl_unlhsh_addr4 *entry; struct audit_buffer *audit_buf; struct net_device *dev; char *secctx; u32 secctx_len; spin_lock(&netlbl_unlhsh_lock); list_entry = netlbl_af4list_remove(addr->s_addr, mask->s_addr, &iface->addr4_list); spin_unlock(&netlbl_unlhsh_lock); if (list_entry != NULL) entry = netlbl_unlhsh_addr4_entry(list_entry); else entry = NULL; audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCDEL, audit_info); if (audit_buf != NULL) { dev = dev_get_by_index(net, iface->ifindex); netlbl_af4list_audit_addr(audit_buf, 1, (dev != NULL ? dev->name : NULL), addr->s_addr, mask->s_addr); dev_put(dev); if (entry != NULL && security_secid_to_secctx(entry->secid, &secctx, &secctx_len) == 0) { audit_log_format(audit_buf, " sec_obj=%s", secctx); security_release_secctx(secctx, secctx_len); } audit_log_format(audit_buf, " res=%u", entry != NULL ? 1 : 0); audit_log_end(audit_buf); } if (entry == NULL) return -ENOENT; kfree_rcu(entry, rcu); return 0; } #if IS_ENABLED(CONFIG_IPV6) /** * netlbl_unlhsh_remove_addr6 - Remove an IPv6 address entry * @net: network namespace * @iface: interface entry * @addr: IP address * @mask: IP address mask * @audit_info: NetLabel audit information * * Description: * Remove an IP address entry from the unlabeled connection hash table. * Returns zero on success, negative values on failure. * */ static int netlbl_unlhsh_remove_addr6(struct net *net, struct netlbl_unlhsh_iface *iface, const struct in6_addr *addr, const struct in6_addr *mask, struct netlbl_audit *audit_info) { struct netlbl_af6list *list_entry; struct netlbl_unlhsh_addr6 *entry; struct audit_buffer *audit_buf; struct net_device *dev; char *secctx; u32 secctx_len; spin_lock(&netlbl_unlhsh_lock); list_entry = netlbl_af6list_remove(addr, mask, &iface->addr6_list); spin_unlock(&netlbl_unlhsh_lock); if (list_entry != NULL) entry = netlbl_unlhsh_addr6_entry(list_entry); else entry = NULL; audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCDEL, audit_info); if (audit_buf != NULL) { dev = dev_get_by_index(net, iface->ifindex); netlbl_af6list_audit_addr(audit_buf, 1, (dev != NULL ? dev->name : NULL), addr, mask); dev_put(dev); if (entry != NULL && security_secid_to_secctx(entry->secid, &secctx, &secctx_len) == 0) { audit_log_format(audit_buf, " sec_obj=%s", secctx); security_release_secctx(secctx, secctx_len); } audit_log_format(audit_buf, " res=%u", entry != NULL ? 1 : 0); audit_log_end(audit_buf); } if (entry == NULL) return -ENOENT; kfree_rcu(entry, rcu); return 0; } #endif /* IPv6 */ /** * netlbl_unlhsh_condremove_iface - Remove an interface entry * @iface: the interface entry * * Description: * Remove an interface entry from the unlabeled connection hash table if it is * empty. An interface entry is considered to be empty if there are no * address entries assigned to it. * */ static void netlbl_unlhsh_condremove_iface(struct netlbl_unlhsh_iface *iface) { struct netlbl_af4list *iter4; #if IS_ENABLED(CONFIG_IPV6) struct netlbl_af6list *iter6; #endif /* IPv6 */ spin_lock(&netlbl_unlhsh_lock); netlbl_af4list_foreach_rcu(iter4, &iface->addr4_list) goto unlhsh_condremove_failure; #if IS_ENABLED(CONFIG_IPV6) netlbl_af6list_foreach_rcu(iter6, &iface->addr6_list) goto unlhsh_condremove_failure; #endif /* IPv6 */ iface->valid = 0; if (iface->ifindex > 0) list_del_rcu(&iface->list); else RCU_INIT_POINTER(netlbl_unlhsh_def, NULL); spin_unlock(&netlbl_unlhsh_lock); call_rcu(&iface->rcu, netlbl_unlhsh_free_iface); return; unlhsh_condremove_failure: spin_unlock(&netlbl_unlhsh_lock); } /** * netlbl_unlhsh_remove - Remove an entry from the unlabeled hash table * @net: network namespace * @dev_name: interface name * @addr: IP address in network byte order * @mask: address mask in network byte order * @addr_len: length of address/mask (4 for IPv4, 16 for IPv6) * @audit_info: NetLabel audit information * * Description: * Removes and existing entry from the unlabeled connection hash table. * Returns zero on success, negative values on failure. * */ int netlbl_unlhsh_remove(struct net *net, const char *dev_name, const void *addr, const void *mask, u32 addr_len, struct netlbl_audit *audit_info) { int ret_val; struct net_device *dev; struct netlbl_unlhsh_iface *iface; if (addr_len != sizeof(struct in_addr) && addr_len != sizeof(struct in6_addr)) return -EINVAL; rcu_read_lock(); if (dev_name != NULL) { dev = dev_get_by_name_rcu(net, dev_name); if (dev == NULL) { ret_val = -ENODEV; goto unlhsh_remove_return; } iface = netlbl_unlhsh_search_iface(dev->ifindex); } else iface = rcu_dereference(netlbl_unlhsh_def); if (iface == NULL) { ret_val = -ENOENT; goto unlhsh_remove_return; } switch (addr_len) { case sizeof(struct in_addr): ret_val = netlbl_unlhsh_remove_addr4(net, iface, addr, mask, audit_info); break; #if IS_ENABLED(CONFIG_IPV6) case sizeof(struct in6_addr): ret_val = netlbl_unlhsh_remove_addr6(net, iface, addr, mask, audit_info); break; #endif /* IPv6 */ default: ret_val = -EINVAL; } if (ret_val == 0) { netlbl_unlhsh_condremove_iface(iface); atomic_dec(&netlabel_mgmt_protocount); } unlhsh_remove_return: rcu_read_unlock(); return ret_val; } /* * General Helper Functions */ /** * netlbl_unlhsh_netdev_handler - Network device notification handler * @this: notifier block * @event: the event * @ptr: the netdevice notifier info (cast to void) * * Description: * Handle network device events, although at present all we care about is a * network device going away. In the case of a device going away we clear any * related entries from the unlabeled connection hash table. * */ static int netlbl_unlhsh_netdev_handler(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct netlbl_unlhsh_iface *iface = NULL; if (!net_eq(dev_net(dev), &init_net)) return NOTIFY_DONE; /* XXX - should this be a check for NETDEV_DOWN or _UNREGISTER? */ if (event == NETDEV_DOWN) { spin_lock(&netlbl_unlhsh_lock); iface = netlbl_unlhsh_search_iface(dev->ifindex); if (iface != NULL && iface->valid) { iface->valid = 0; list_del_rcu(&iface->list); } else iface = NULL; spin_unlock(&netlbl_unlhsh_lock); } if (iface != NULL) call_rcu(&iface->rcu, netlbl_unlhsh_free_iface); return NOTIFY_DONE; } /** * netlbl_unlabel_acceptflg_set - Set the unlabeled accept flag * @value: desired value * @audit_info: NetLabel audit information * * Description: * Set the value of the unlabeled accept flag to @value. * */ static void netlbl_unlabel_acceptflg_set(u8 value, struct netlbl_audit *audit_info) { struct audit_buffer *audit_buf; u8 old_val; old_val = netlabel_unlabel_acceptflg; netlabel_unlabel_acceptflg = value; audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_ALLOW, audit_info); if (audit_buf != NULL) { audit_log_format(audit_buf, " unlbl_accept=%u old=%u", value, old_val); audit_log_end(audit_buf); } } /** * netlbl_unlabel_addrinfo_get - Get the IPv4/6 address information * @info: the Generic NETLINK info block * @addr: the IP address * @mask: the IP address mask * @len: the address length * * Description: * Examine the Generic NETLINK message and extract the IP address information. * Returns zero on success, negative values on failure. * */ static int netlbl_unlabel_addrinfo_get(struct genl_info *info, void **addr, void **mask, u32 *len) { u32 addr_len; if (info->attrs[NLBL_UNLABEL_A_IPV4ADDR] && info->attrs[NLBL_UNLABEL_A_IPV4MASK]) { addr_len = nla_len(info->attrs[NLBL_UNLABEL_A_IPV4ADDR]); if (addr_len != sizeof(struct in_addr) && addr_len != nla_len(info->attrs[NLBL_UNLABEL_A_IPV4MASK])) return -EINVAL; *len = addr_len; *addr = nla_data(info->attrs[NLBL_UNLABEL_A_IPV4ADDR]); *mask = nla_data(info->attrs[NLBL_UNLABEL_A_IPV4MASK]); return 0; } else if (info->attrs[NLBL_UNLABEL_A_IPV6ADDR]) { addr_len = nla_len(info->attrs[NLBL_UNLABEL_A_IPV6ADDR]); if (addr_len != sizeof(struct in6_addr) && addr_len != nla_len(info->attrs[NLBL_UNLABEL_A_IPV6MASK])) return -EINVAL; *len = addr_len; *addr = nla_data(info->attrs[NLBL_UNLABEL_A_IPV6ADDR]); *mask = nla_data(info->attrs[NLBL_UNLABEL_A_IPV6MASK]); return 0; } return -EINVAL; } /* * NetLabel Command Handlers */ /** * netlbl_unlabel_accept - Handle an ACCEPT message * @skb: the NETLINK buffer * @info: the Generic NETLINK info block * * Description: * Process a user generated ACCEPT message and set the accept flag accordingly. * Returns zero on success, negative values on failure. * */ static int netlbl_unlabel_accept(struct sk_buff *skb, struct genl_info *info) { u8 value; struct netlbl_audit audit_info; if (info->attrs[NLBL_UNLABEL_A_ACPTFLG]) { value = nla_get_u8(info->attrs[NLBL_UNLABEL_A_ACPTFLG]); if (value == 1 || value == 0) { netlbl_netlink_auditinfo(&audit_info); netlbl_unlabel_acceptflg_set(value, &audit_info); return 0; } } return -EINVAL; } /** * netlbl_unlabel_list - Handle a LIST message * @skb: the NETLINK buffer * @info: the Generic NETLINK info block * * Description: * Process a user generated LIST message and respond with the current status. * Returns zero on success, negative values on failure. * */ static int netlbl_unlabel_list(struct sk_buff *skb, struct genl_info *info) { int ret_val = -EINVAL; struct sk_buff *ans_skb; void *data; ans_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (ans_skb == NULL) goto list_failure; data = genlmsg_put_reply(ans_skb, info, &netlbl_unlabel_gnl_family, 0, NLBL_UNLABEL_C_LIST); if (data == NULL) { ret_val = -ENOMEM; goto list_failure; } ret_val = nla_put_u8(ans_skb, NLBL_UNLABEL_A_ACPTFLG, netlabel_unlabel_acceptflg); if (ret_val != 0) goto list_failure; genlmsg_end(ans_skb, data); return genlmsg_reply(ans_skb, info); list_failure: kfree_skb(ans_skb); return ret_val; } /** * netlbl_unlabel_staticadd - Handle a STATICADD message * @skb: the NETLINK buffer * @info: the Generic NETLINK info block * * Description: * Process a user generated STATICADD message and add a new unlabeled * connection entry to the hash table. Returns zero on success, negative * values on failure. * */ static int netlbl_unlabel_staticadd(struct sk_buff *skb, struct genl_info *info) { int ret_val; char *dev_name; void *addr; void *mask; u32 addr_len; u32 secid; struct netlbl_audit audit_info; /* Don't allow users to add both IPv4 and IPv6 addresses for a * single entry. However, allow users to create two entries, one each * for IPv4 and IPv6, with the same LSM security context which should * achieve the same result. */ if (!info->attrs[NLBL_UNLABEL_A_SECCTX] || !info->attrs[NLBL_UNLABEL_A_IFACE] || !((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] || !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^ (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] || !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) return -EINVAL; netlbl_netlink_auditinfo(&audit_info); ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); if (ret_val != 0) return ret_val; dev_name = nla_data(info->attrs[NLBL_UNLABEL_A_IFACE]); ret_val = security_secctx_to_secid( nla_data(info->attrs[NLBL_UNLABEL_A_SECCTX]), nla_len(info->attrs[NLBL_UNLABEL_A_SECCTX]), &secid); if (ret_val != 0) return ret_val; return netlbl_unlhsh_add(&init_net, dev_name, addr, mask, addr_len, secid, &audit_info); } /** * netlbl_unlabel_staticadddef - Handle a STATICADDDEF message * @skb: the NETLINK buffer * @info: the Generic NETLINK info block * * Description: * Process a user generated STATICADDDEF message and add a new default * unlabeled connection entry. Returns zero on success, negative values on * failure. * */ static int netlbl_unlabel_staticadddef(struct sk_buff *skb, struct genl_info *info) { int ret_val; void *addr; void *mask; u32 addr_len; u32 secid; struct netlbl_audit audit_info; /* Don't allow users to add both IPv4 and IPv6 addresses for a * single entry. However, allow users to create two entries, one each * for IPv4 and IPv6, with the same LSM security context which should * achieve the same result. */ if (!info->attrs[NLBL_UNLABEL_A_SECCTX] || !((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] || !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^ (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] || !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) return -EINVAL; netlbl_netlink_auditinfo(&audit_info); ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); if (ret_val != 0) return ret_val; ret_val = security_secctx_to_secid( nla_data(info->attrs[NLBL_UNLABEL_A_SECCTX]), nla_len(info->attrs[NLBL_UNLABEL_A_SECCTX]), &secid); if (ret_val != 0) return ret_val; return netlbl_unlhsh_add(&init_net, NULL, addr, mask, addr_len, secid, &audit_info); } /** * netlbl_unlabel_staticremove - Handle a STATICREMOVE message * @skb: the NETLINK buffer * @info: the Generic NETLINK info block * * Description: * Process a user generated STATICREMOVE message and remove the specified * unlabeled connection entry. Returns zero on success, negative values on * failure. * */ static int netlbl_unlabel_staticremove(struct sk_buff *skb, struct genl_info *info) { int ret_val; char *dev_name; void *addr; void *mask; u32 addr_len; struct netlbl_audit audit_info; /* See the note in netlbl_unlabel_staticadd() about not allowing both * IPv4 and IPv6 in the same entry. */ if (!info->attrs[NLBL_UNLABEL_A_IFACE] || !((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] || !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^ (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] || !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) return -EINVAL; netlbl_netlink_auditinfo(&audit_info); ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); if (ret_val != 0) return ret_val; dev_name = nla_data(info->attrs[NLBL_UNLABEL_A_IFACE]); return netlbl_unlhsh_remove(&init_net, dev_name, addr, mask, addr_len, &audit_info); } /** * netlbl_unlabel_staticremovedef - Handle a STATICREMOVEDEF message * @skb: the NETLINK buffer * @info: the Generic NETLINK info block * * Description: * Process a user generated STATICREMOVEDEF message and remove the default * unlabeled connection entry. Returns zero on success, negative values on * failure. * */ static int netlbl_unlabel_staticremovedef(struct sk_buff *skb, struct genl_info *info) { int ret_val; void *addr; void *mask; u32 addr_len; struct netlbl_audit audit_info; /* See the note in netlbl_unlabel_staticadd() about not allowing both * IPv4 and IPv6 in the same entry. */ if (!((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] || !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^ (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] || !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) return -EINVAL; netlbl_netlink_auditinfo(&audit_info); ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); if (ret_val != 0) return ret_val; return netlbl_unlhsh_remove(&init_net, NULL, addr, mask, addr_len, &audit_info); } /** * netlbl_unlabel_staticlist_gen - Generate messages for STATICLIST[DEF] * @cmd: command/message * @iface: the interface entry * @addr4: the IPv4 address entry * @addr6: the IPv6 address entry * @arg: the netlbl_unlhsh_walk_arg structure * * Description: * This function is designed to be used to generate a response for a * STATICLIST or STATICLISTDEF message. When called either @addr4 or @addr6 * can be specified, not both, the other unspecified entry should be set to * NULL by the caller. Returns the size of the message on success, negative * values on failure. * */ static int netlbl_unlabel_staticlist_gen(u32 cmd, const struct netlbl_unlhsh_iface *iface, const struct netlbl_unlhsh_addr4 *addr4, const struct netlbl_unlhsh_addr6 *addr6, void *arg) { int ret_val = -ENOMEM; struct netlbl_unlhsh_walk_arg *cb_arg = arg; struct net_device *dev; void *data; u32 secid; char *secctx; u32 secctx_len; data = genlmsg_put(cb_arg->skb, NETLINK_CB(cb_arg->nl_cb->skb).portid, cb_arg->seq, &netlbl_unlabel_gnl_family, NLM_F_MULTI, cmd); if (data == NULL) goto list_cb_failure; if (iface->ifindex > 0) { dev = dev_get_by_index(&init_net, iface->ifindex); if (!dev) { ret_val = -ENODEV; goto list_cb_failure; } ret_val = nla_put_string(cb_arg->skb, NLBL_UNLABEL_A_IFACE, dev->name); dev_put(dev); if (ret_val != 0) goto list_cb_failure; } if (addr4) { struct in_addr addr_struct; addr_struct.s_addr = addr4->list.addr; ret_val = nla_put_in_addr(cb_arg->skb, NLBL_UNLABEL_A_IPV4ADDR, addr_struct.s_addr); if (ret_val != 0) goto list_cb_failure; addr_struct.s_addr = addr4->list.mask; ret_val = nla_put_in_addr(cb_arg->skb, NLBL_UNLABEL_A_IPV4MASK, addr_struct.s_addr); if (ret_val != 0) goto list_cb_failure; secid = addr4->secid; } else { ret_val = nla_put_in6_addr(cb_arg->skb, NLBL_UNLABEL_A_IPV6ADDR, &addr6->list.addr); if (ret_val != 0) goto list_cb_failure; ret_val = nla_put_in6_addr(cb_arg->skb, NLBL_UNLABEL_A_IPV6MASK, &addr6->list.mask); if (ret_val != 0) goto list_cb_failure; secid = addr6->secid; } ret_val = security_secid_to_secctx(secid, &secctx, &secctx_len); if (ret_val != 0) goto list_cb_failure; ret_val = nla_put(cb_arg->skb, NLBL_UNLABEL_A_SECCTX, secctx_len, secctx); security_release_secctx(secctx, secctx_len); if (ret_val != 0) goto list_cb_failure; cb_arg->seq++; genlmsg_end(cb_arg->skb, data); return 0; list_cb_failure: genlmsg_cancel(cb_arg->skb, data); return ret_val; } /** * netlbl_unlabel_staticlist - Handle a STATICLIST message * @skb: the NETLINK buffer * @cb: the NETLINK callback * * Description: * Process a user generated STATICLIST message and dump the unlabeled * connection hash table in a form suitable for use in a kernel generated * STATICLIST message. Returns the length of @skb. * */ static int netlbl_unlabel_staticlist(struct sk_buff *skb, struct netlink_callback *cb) { struct netlbl_unlhsh_walk_arg cb_arg; u32 skip_bkt = cb->args[0]; u32 skip_chain = cb->args[1]; u32 skip_addr4 = cb->args[2]; u32 iter_bkt, iter_chain = 0, iter_addr4 = 0, iter_addr6 = 0; struct netlbl_unlhsh_iface *iface; struct list_head *iter_list; struct netlbl_af4list *addr4; #if IS_ENABLED(CONFIG_IPV6) u32 skip_addr6 = cb->args[3]; struct netlbl_af6list *addr6; #endif cb_arg.nl_cb = cb; cb_arg.skb = skb; cb_arg.seq = cb->nlh->nlmsg_seq; rcu_read_lock(); for (iter_bkt = skip_bkt; iter_bkt < rcu_dereference(netlbl_unlhsh)->size; iter_bkt++) { iter_list = &rcu_dereference(netlbl_unlhsh)->tbl[iter_bkt]; list_for_each_entry_rcu(iface, iter_list, list) { if (!iface->valid || iter_chain++ < skip_chain) continue; netlbl_af4list_foreach_rcu(addr4, &iface->addr4_list) { if (iter_addr4++ < skip_addr4) continue; if (netlbl_unlabel_staticlist_gen( NLBL_UNLABEL_C_STATICLIST, iface, netlbl_unlhsh_addr4_entry(addr4), NULL, &cb_arg) < 0) { iter_addr4--; iter_chain--; goto unlabel_staticlist_return; } } iter_addr4 = 0; skip_addr4 = 0; #if IS_ENABLED(CONFIG_IPV6) netlbl_af6list_foreach_rcu(addr6, &iface->addr6_list) { if (iter_addr6++ < skip_addr6) continue; if (netlbl_unlabel_staticlist_gen( NLBL_UNLABEL_C_STATICLIST, iface, NULL, netlbl_unlhsh_addr6_entry(addr6), &cb_arg) < 0) { iter_addr6--; iter_chain--; goto unlabel_staticlist_return; } } iter_addr6 = 0; skip_addr6 = 0; #endif /* IPv6 */ } iter_chain = 0; skip_chain = 0; } unlabel_staticlist_return: rcu_read_unlock(); cb->args[0] = iter_bkt; cb->args[1] = iter_chain; cb->args[2] = iter_addr4; cb->args[3] = iter_addr6; return skb->len; } /** * netlbl_unlabel_staticlistdef - Handle a STATICLISTDEF message * @skb: the NETLINK buffer * @cb: the NETLINK callback * * Description: * Process a user generated STATICLISTDEF message and dump the default * unlabeled connection entry in a form suitable for use in a kernel generated * STATICLISTDEF message. Returns the length of @skb. * */ static int netlbl_unlabel_staticlistdef(struct sk_buff *skb, struct netlink_callback *cb) { struct netlbl_unlhsh_walk_arg cb_arg; struct netlbl_unlhsh_iface *iface; u32 iter_addr4 = 0, iter_addr6 = 0; struct netlbl_af4list *addr4; #if IS_ENABLED(CONFIG_IPV6) struct netlbl_af6list *addr6; #endif cb_arg.nl_cb = cb; cb_arg.skb = skb; cb_arg.seq = cb->nlh->nlmsg_seq; rcu_read_lock(); iface = rcu_dereference(netlbl_unlhsh_def); if (iface == NULL || !iface->valid) goto unlabel_staticlistdef_return; netlbl_af4list_foreach_rcu(addr4, &iface->addr4_list) { if (iter_addr4++ < cb->args[0]) continue; if (netlbl_unlabel_staticlist_gen(NLBL_UNLABEL_C_STATICLISTDEF, iface, netlbl_unlhsh_addr4_entry(addr4), NULL, &cb_arg) < 0) { iter_addr4--; goto unlabel_staticlistdef_return; } } #if IS_ENABLED(CONFIG_IPV6) netlbl_af6list_foreach_rcu(addr6, &iface->addr6_list) { if (iter_addr6++ < cb->args[1]) continue; if (netlbl_unlabel_staticlist_gen(NLBL_UNLABEL_C_STATICLISTDEF, iface, NULL, netlbl_unlhsh_addr6_entry(addr6), &cb_arg) < 0) { iter_addr6--; goto unlabel_staticlistdef_return; } } #endif /* IPv6 */ unlabel_staticlistdef_return: rcu_read_unlock(); cb->args[0] = iter_addr4; cb->args[1] = iter_addr6; return skb->len; } /* * NetLabel Generic NETLINK Command Definitions */ static const struct genl_small_ops netlbl_unlabel_genl_ops[] = { { .cmd = NLBL_UNLABEL_C_STATICADD, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = netlbl_unlabel_staticadd, .dumpit = NULL, }, { .cmd = NLBL_UNLABEL_C_STATICREMOVE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = netlbl_unlabel_staticremove, .dumpit = NULL, }, { .cmd = NLBL_UNLABEL_C_STATICLIST, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, .doit = NULL, .dumpit = netlbl_unlabel_staticlist, }, { .cmd = NLBL_UNLABEL_C_STATICADDDEF, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = netlbl_unlabel_staticadddef, .dumpit = NULL, }, { .cmd = NLBL_UNLABEL_C_STATICREMOVEDEF, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = netlbl_unlabel_staticremovedef, .dumpit = NULL, }, { .cmd = NLBL_UNLABEL_C_STATICLISTDEF, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, .doit = NULL, .dumpit = netlbl_unlabel_staticlistdef, }, { .cmd = NLBL_UNLABEL_C_ACCEPT, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = netlbl_unlabel_accept, .dumpit = NULL, }, { .cmd = NLBL_UNLABEL_C_LIST, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, .doit = netlbl_unlabel_list, .dumpit = NULL, }, }; static struct genl_family netlbl_unlabel_gnl_family __ro_after_init = { .hdrsize = 0, .name = NETLBL_NLTYPE_UNLABELED_NAME, .version = NETLBL_PROTO_VERSION, .maxattr = NLBL_UNLABEL_A_MAX, .policy = netlbl_unlabel_genl_policy, .module = THIS_MODULE, .small_ops = netlbl_unlabel_genl_ops, .n_small_ops = ARRAY_SIZE(netlbl_unlabel_genl_ops), .resv_start_op = NLBL_UNLABEL_C_STATICLISTDEF + 1, }; /* * NetLabel Generic NETLINK Protocol Functions */ /** * netlbl_unlabel_genl_init - Register the Unlabeled NetLabel component * * Description: * Register the unlabeled packet NetLabel component with the Generic NETLINK * mechanism. Returns zero on success, negative values on failure. * */ int __init netlbl_unlabel_genl_init(void) { return genl_register_family(&netlbl_unlabel_gnl_family); } /* * NetLabel KAPI Hooks */ static struct notifier_block netlbl_unlhsh_netdev_notifier = { .notifier_call = netlbl_unlhsh_netdev_handler, }; /** * netlbl_unlabel_init - Initialize the unlabeled connection hash table * @size: the number of bits to use for the hash buckets * * Description: * Initializes the unlabeled connection hash table and registers a network * device notification handler. This function should only be called by the * NetLabel subsystem itself during initialization. Returns zero on success, * non-zero values on error. * */ int __init netlbl_unlabel_init(u32 size) { u32 iter; struct netlbl_unlhsh_tbl *hsh_tbl; if (size == 0) return -EINVAL; hsh_tbl = kmalloc(sizeof(*hsh_tbl), GFP_KERNEL); if (hsh_tbl == NULL) return -ENOMEM; hsh_tbl->size = 1 << size; hsh_tbl->tbl = kcalloc(hsh_tbl->size, sizeof(struct list_head), GFP_KERNEL); if (hsh_tbl->tbl == NULL) { kfree(hsh_tbl); return -ENOMEM; } for (iter = 0; iter < hsh_tbl->size; iter++) INIT_LIST_HEAD(&hsh_tbl->tbl[iter]); spin_lock(&netlbl_unlhsh_lock); rcu_assign_pointer(netlbl_unlhsh, hsh_tbl); spin_unlock(&netlbl_unlhsh_lock); register_netdevice_notifier(&netlbl_unlhsh_netdev_notifier); return 0; } /** * netlbl_unlabel_getattr - Get the security attributes for an unlabled packet * @skb: the packet * @family: protocol family * @secattr: the security attributes * * Description: * Determine the security attributes, if any, for an unlabled packet and return * them in @secattr. Returns zero on success and negative values on failure. * */ int netlbl_unlabel_getattr(const struct sk_buff *skb, u16 family, struct netlbl_lsm_secattr *secattr) { struct netlbl_unlhsh_iface *iface; rcu_read_lock(); iface = netlbl_unlhsh_search_iface(skb->skb_iif); if (iface == NULL) iface = rcu_dereference(netlbl_unlhsh_def); if (iface == NULL || !iface->valid) goto unlabel_getattr_nolabel; #if IS_ENABLED(CONFIG_IPV6) /* When resolving a fallback label, check the sk_buff version as * it is possible (e.g. SCTP) to have family = PF_INET6 while * receiving ip_hdr(skb)->version = 4. */ if (family == PF_INET6 && ip_hdr(skb)->version == 4) family = PF_INET; #endif /* IPv6 */ switch (family) { case PF_INET: { struct iphdr *hdr4; struct netlbl_af4list *addr4; hdr4 = ip_hdr(skb); addr4 = netlbl_af4list_search(hdr4->saddr, &iface->addr4_list); if (addr4 == NULL) goto unlabel_getattr_nolabel; secattr->attr.secid = netlbl_unlhsh_addr4_entry(addr4)->secid; break; } #if IS_ENABLED(CONFIG_IPV6) case PF_INET6: { struct ipv6hdr *hdr6; struct netlbl_af6list *addr6; hdr6 = ipv6_hdr(skb); addr6 = netlbl_af6list_search(&hdr6->saddr, &iface->addr6_list); if (addr6 == NULL) goto unlabel_getattr_nolabel; secattr->attr.secid = netlbl_unlhsh_addr6_entry(addr6)->secid; break; } #endif /* IPv6 */ default: goto unlabel_getattr_nolabel; } rcu_read_unlock(); secattr->flags |= NETLBL_SECATTR_SECID; secattr->type = NETLBL_NLTYPE_UNLABELED; return 0; unlabel_getattr_nolabel: rcu_read_unlock(); if (netlabel_unlabel_acceptflg == 0) return -ENOMSG; secattr->type = NETLBL_NLTYPE_UNLABELED; return 0; } /** * netlbl_unlabel_defconf - Set the default config to allow unlabeled packets * * Description: * Set the default NetLabel configuration to allow incoming unlabeled packets * and to send unlabeled network traffic by default. * */ int __init netlbl_unlabel_defconf(void) { int ret_val; struct netlbl_dom_map *entry; struct netlbl_audit audit_info; /* Only the kernel is allowed to call this function and the only time * it is called is at bootup before the audit subsystem is reporting * messages so don't worry to much about these values. */ security_current_getsecid_subj(&audit_info.secid); audit_info.loginuid = GLOBAL_ROOT_UID; audit_info.sessionid = 0; entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (entry == NULL) return -ENOMEM; entry->family = AF_UNSPEC; entry->def.type = NETLBL_NLTYPE_UNLABELED; ret_val = netlbl_domhsh_add_default(entry, &audit_info); if (ret_val != 0) return ret_val; netlbl_unlabel_acceptflg_set(1, &audit_info); return 0; }
5 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 3 3 3 3 3 3 3 3 3 3 5 1 3 3 3 3 3 5 2 3 3 3 3 3 3 3 2 2 5 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 // SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/super.c * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ */ #include <linux/module.h> #include <linux/init.h> #include <linux/fs.h> #include <linux/fs_context.h> #include <linux/sched/mm.h> #include <linux/statfs.h> #include <linux/buffer_head.h> #include <linux/kthread.h> #include <linux/parser.h> #include <linux/mount.h> #include <linux/seq_file.h> #include <linux/proc_fs.h> #include <linux/random.h> #include <linux/exportfs.h> #include <linux/blkdev.h> #include <linux/quotaops.h> #include <linux/f2fs_fs.h> #include <linux/sysfs.h> #include <linux/quota.h> #include <linux/unicode.h> #include <linux/part_stat.h> #include <linux/zstd.h> #include <linux/lz4.h> #include "f2fs.h" #include "node.h" #include "segment.h" #include "xattr.h" #include "gc.h" #include "iostat.h" #define CREATE_TRACE_POINTS #include <trace/events/f2fs.h> static struct kmem_cache *f2fs_inode_cachep; #ifdef CONFIG_F2FS_FAULT_INJECTION const char *f2fs_fault_name[FAULT_MAX] = { [FAULT_KMALLOC] = "kmalloc", [FAULT_KVMALLOC] = "kvmalloc", [FAULT_PAGE_ALLOC] = "page alloc", [FAULT_PAGE_GET] = "page get", [FAULT_ALLOC_NID] = "alloc nid", [FAULT_ORPHAN] = "orphan", [FAULT_BLOCK] = "no more block", [FAULT_DIR_DEPTH] = "too big dir depth", [FAULT_EVICT_INODE] = "evict_inode fail", [FAULT_TRUNCATE] = "truncate fail", [FAULT_READ_IO] = "read IO error", [FAULT_CHECKPOINT] = "checkpoint error", [FAULT_DISCARD] = "discard error", [FAULT_WRITE_IO] = "write IO error", [FAULT_SLAB_ALLOC] = "slab alloc", [FAULT_DQUOT_INIT] = "dquot initialize", [FAULT_LOCK_OP] = "lock_op", [FAULT_BLKADDR] = "invalid blkaddr", }; void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate, unsigned int type) { struct f2fs_fault_info *ffi = &F2FS_OPTION(sbi).fault_info; if (rate) { atomic_set(&ffi->inject_ops, 0); ffi->inject_rate = rate; } if (type) ffi->inject_type = type; if (!rate && !type) memset(ffi, 0, sizeof(struct f2fs_fault_info)); } #endif /* f2fs-wide shrinker description */ static struct shrinker *f2fs_shrinker_info; static int __init f2fs_init_shrinker(void) { f2fs_shrinker_info = shrinker_alloc(0, "f2fs-shrinker"); if (!f2fs_shrinker_info) return -ENOMEM; f2fs_shrinker_info->count_objects = f2fs_shrink_count; f2fs_shrinker_info->scan_objects = f2fs_shrink_scan; shrinker_register(f2fs_shrinker_info); return 0; } static void f2fs_exit_shrinker(void) { shrinker_free(f2fs_shrinker_info); } enum { Opt_gc_background, Opt_disable_roll_forward, Opt_norecovery, Opt_discard, Opt_nodiscard, Opt_noheap, Opt_heap, Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, Opt_active_logs, Opt_disable_ext_identify, Opt_inline_xattr, Opt_noinline_xattr, Opt_inline_xattr_size, Opt_inline_data, Opt_inline_dentry, Opt_noinline_dentry, Opt_flush_merge, Opt_noflush_merge, Opt_barrier, Opt_nobarrier, Opt_fastboot, Opt_extent_cache, Opt_noextent_cache, Opt_noinline_data, Opt_data_flush, Opt_reserve_root, Opt_resgid, Opt_resuid, Opt_mode, Opt_io_size_bits, Opt_fault_injection, Opt_fault_type, Opt_lazytime, Opt_nolazytime, Opt_quota, Opt_noquota, Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_usrjquota, Opt_grpjquota, Opt_prjjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_offprjjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_alloc, Opt_fsync, Opt_test_dummy_encryption, Opt_inlinecrypt, Opt_checkpoint_disable, Opt_checkpoint_disable_cap, Opt_checkpoint_disable_cap_perc, Opt_checkpoint_enable, Opt_checkpoint_merge, Opt_nocheckpoint_merge, Opt_compress_algorithm, Opt_compress_log_size, Opt_compress_extension, Opt_nocompress_extension, Opt_compress_chksum, Opt_compress_mode, Opt_compress_cache, Opt_atgc, Opt_gc_merge, Opt_nogc_merge, Opt_discard_unit, Opt_memory_mode, Opt_age_extent_cache, Opt_errors, Opt_err, }; static match_table_t f2fs_tokens = { {Opt_gc_background, "background_gc=%s"}, {Opt_disable_roll_forward, "disable_roll_forward"}, {Opt_norecovery, "norecovery"}, {Opt_discard, "discard"}, {Opt_nodiscard, "nodiscard"}, {Opt_noheap, "no_heap"}, {Opt_heap, "heap"}, {Opt_user_xattr, "user_xattr"}, {Opt_nouser_xattr, "nouser_xattr"}, {Opt_acl, "acl"}, {Opt_noacl, "noacl"}, {Opt_active_logs, "active_logs=%u"}, {Opt_disable_ext_identify, "disable_ext_identify"}, {Opt_inline_xattr, "inline_xattr"}, {Opt_noinline_xattr, "noinline_xattr"}, {Opt_inline_xattr_size, "inline_xattr_size=%u"}, {Opt_inline_data, "inline_data"}, {Opt_inline_dentry, "inline_dentry"}, {Opt_noinline_dentry, "noinline_dentry"}, {Opt_flush_merge, "flush_merge"}, {Opt_noflush_merge, "noflush_merge"}, {Opt_barrier, "barrier"}, {Opt_nobarrier, "nobarrier"}, {Opt_fastboot, "fastboot"}, {Opt_extent_cache, "extent_cache"}, {Opt_noextent_cache, "noextent_cache"}, {Opt_noinline_data, "noinline_data"}, {Opt_data_flush, "data_flush"}, {Opt_reserve_root, "reserve_root=%u"}, {Opt_resgid, "resgid=%u"}, {Opt_resuid, "resuid=%u"}, {Opt_mode, "mode=%s"}, {Opt_io_size_bits, "io_bits=%u"}, {Opt_fault_injection, "fault_injection=%u"}, {Opt_fault_type, "fault_type=%u"}, {Opt_lazytime, "lazytime"}, {Opt_nolazytime, "nolazytime"}, {Opt_quota, "quota"}, {Opt_noquota, "noquota"}, {Opt_usrquota, "usrquota"}, {Opt_grpquota, "grpquota"}, {Opt_prjquota, "prjquota"}, {Opt_usrjquota, "usrjquota=%s"}, {Opt_grpjquota, "grpjquota=%s"}, {Opt_prjjquota, "prjjquota=%s"}, {Opt_offusrjquota, "usrjquota="}, {Opt_offgrpjquota, "grpjquota="}, {Opt_offprjjquota, "prjjquota="}, {Opt_jqfmt_vfsold, "jqfmt=vfsold"}, {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"}, {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"}, {Opt_alloc, "alloc_mode=%s"}, {Opt_fsync, "fsync_mode=%s"}, {Opt_test_dummy_encryption, "test_dummy_encryption=%s"}, {Opt_test_dummy_encryption, "test_dummy_encryption"}, {Opt_inlinecrypt, "inlinecrypt"}, {Opt_checkpoint_disable, "checkpoint=disable"}, {Opt_checkpoint_disable_cap, "checkpoint=disable:%u"}, {Opt_checkpoint_disable_cap_perc, "checkpoint=disable:%u%%"}, {Opt_checkpoint_enable, "checkpoint=enable"}, {Opt_checkpoint_merge, "checkpoint_merge"}, {Opt_nocheckpoint_merge, "nocheckpoint_merge"}, {Opt_compress_algorithm, "compress_algorithm=%s"}, {Opt_compress_log_size, "compress_log_size=%u"}, {Opt_compress_extension, "compress_extension=%s"}, {Opt_nocompress_extension, "nocompress_extension=%s"}, {Opt_compress_chksum, "compress_chksum"}, {Opt_compress_mode, "compress_mode=%s"}, {Opt_compress_cache, "compress_cache"}, {Opt_atgc, "atgc"}, {Opt_gc_merge, "gc_merge"}, {Opt_nogc_merge, "nogc_merge"}, {Opt_discard_unit, "discard_unit=%s"}, {Opt_memory_mode, "memory=%s"}, {Opt_age_extent_cache, "age_extent_cache"}, {Opt_errors, "errors=%s"}, {Opt_err, NULL}, }; void f2fs_printk(struct f2fs_sb_info *sbi, const char *fmt, ...) { struct va_format vaf; va_list args; int level; va_start(args, fmt); level = printk_get_level(fmt); vaf.fmt = printk_skip_level(fmt); vaf.va = &args; printk("%c%cF2FS-fs (%s): %pV\n", KERN_SOH_ASCII, level, sbi->sb->s_id, &vaf); va_end(args); } #if IS_ENABLED(CONFIG_UNICODE) static const struct f2fs_sb_encodings { __u16 magic; char *name; unsigned int version; } f2fs_sb_encoding_map[] = { {F2FS_ENC_UTF8_12_1, "utf8", UNICODE_AGE(12, 1, 0)}, }; static const struct f2fs_sb_encodings * f2fs_sb_read_encoding(const struct f2fs_super_block *sb) { __u16 magic = le16_to_cpu(sb->s_encoding); int i; for (i = 0; i < ARRAY_SIZE(f2fs_sb_encoding_map); i++) if (magic == f2fs_sb_encoding_map[i].magic) return &f2fs_sb_encoding_map[i]; return NULL; } struct kmem_cache *f2fs_cf_name_slab; static int __init f2fs_create_casefold_cache(void) { f2fs_cf_name_slab = f2fs_kmem_cache_create("f2fs_casefolded_name", F2FS_NAME_LEN); return f2fs_cf_name_slab ? 0 : -ENOMEM; } static void f2fs_destroy_casefold_cache(void) { kmem_cache_destroy(f2fs_cf_name_slab); } #else static int __init f2fs_create_casefold_cache(void) { return 0; } static void f2fs_destroy_casefold_cache(void) { } #endif static inline void limit_reserve_root(struct f2fs_sb_info *sbi) { block_t limit = min((sbi->user_block_count >> 3), sbi->user_block_count - sbi->reserved_blocks); /* limit is 12.5% */ if (test_opt(sbi, RESERVE_ROOT) && F2FS_OPTION(sbi).root_reserved_blocks > limit) { F2FS_OPTION(sbi).root_reserved_blocks = limit; f2fs_info(sbi, "Reduce reserved blocks for root = %u", F2FS_OPTION(sbi).root_reserved_blocks); } if (!test_opt(sbi, RESERVE_ROOT) && (!uid_eq(F2FS_OPTION(sbi).s_resuid, make_kuid(&init_user_ns, F2FS_DEF_RESUID)) || !gid_eq(F2FS_OPTION(sbi).s_resgid, make_kgid(&init_user_ns, F2FS_DEF_RESGID)))) f2fs_info(sbi, "Ignore s_resuid=%u, s_resgid=%u w/o reserve_root", from_kuid_munged(&init_user_ns, F2FS_OPTION(sbi).s_resuid), from_kgid_munged(&init_user_ns, F2FS_OPTION(sbi).s_resgid)); } static inline int adjust_reserved_segment(struct f2fs_sb_info *sbi) { unsigned int sec_blks = sbi->blocks_per_seg * sbi->segs_per_sec; unsigned int avg_vblocks; unsigned int wanted_reserved_segments; block_t avail_user_block_count; if (!F2FS_IO_ALIGNED(sbi)) return 0; /* average valid block count in section in worst case */ avg_vblocks = sec_blks / F2FS_IO_SIZE(sbi); /* * we need enough free space when migrating one section in worst case */ wanted_reserved_segments = (F2FS_IO_SIZE(sbi) / avg_vblocks) * reserved_segments(sbi); wanted_reserved_segments -= reserved_segments(sbi); avail_user_block_count = sbi->user_block_count - sbi->current_reserved_blocks - F2FS_OPTION(sbi).root_reserved_blocks; if (wanted_reserved_segments * sbi->blocks_per_seg > avail_user_block_count) { f2fs_err(sbi, "IO align feature can't grab additional reserved segment: %u, available segments: %u", wanted_reserved_segments, avail_user_block_count >> sbi->log_blocks_per_seg); return -ENOSPC; } SM_I(sbi)->additional_reserved_segments = wanted_reserved_segments; f2fs_info(sbi, "IO align feature needs additional reserved segment: %u", wanted_reserved_segments); return 0; } static inline void adjust_unusable_cap_perc(struct f2fs_sb_info *sbi) { if (!F2FS_OPTION(sbi).unusable_cap_perc) return; if (F2FS_OPTION(sbi).unusable_cap_perc == 100) F2FS_OPTION(sbi).unusable_cap = sbi->user_block_count; else F2FS_OPTION(sbi).unusable_cap = (sbi->user_block_count / 100) * F2FS_OPTION(sbi).unusable_cap_perc; f2fs_info(sbi, "Adjust unusable cap for checkpoint=disable = %u / %u%%", F2FS_OPTION(sbi).unusable_cap, F2FS_OPTION(sbi).unusable_cap_perc); } static void init_once(void *foo) { struct f2fs_inode_info *fi = (struct f2fs_inode_info *) foo; inode_init_once(&fi->vfs_inode); } #ifdef CONFIG_QUOTA static const char * const quotatypes[] = INITQFNAMES; #define QTYPE2NAME(t) (quotatypes[t]) static int f2fs_set_qf_name(struct super_block *sb, int qtype, substring_t *args) { struct f2fs_sb_info *sbi = F2FS_SB(sb); char *qname; int ret = -EINVAL; if (sb_any_quota_loaded(sb) && !F2FS_OPTION(sbi).s_qf_names[qtype]) { f2fs_err(sbi, "Cannot change journaled quota options when quota turned on"); return -EINVAL; } if (f2fs_sb_has_quota_ino(sbi)) { f2fs_info(sbi, "QUOTA feature is enabled, so ignore qf_name"); return 0; } qname = match_strdup(args); if (!qname) { f2fs_err(sbi, "Not enough memory for storing quotafile name"); return -ENOMEM; } if (F2FS_OPTION(sbi).s_qf_names[qtype]) { if (strcmp(F2FS_OPTION(sbi).s_qf_names[qtype], qname) == 0) ret = 0; else f2fs_err(sbi, "%s quota file already specified", QTYPE2NAME(qtype)); goto errout; } if (strchr(qname, '/')) { f2fs_err(sbi, "quotafile must be on filesystem root"); goto errout; } F2FS_OPTION(sbi).s_qf_names[qtype] = qname; set_opt(sbi, QUOTA); return 0; errout: kfree(qname); return ret; } static int f2fs_clear_qf_name(struct super_block *sb, int qtype) { struct f2fs_sb_info *sbi = F2FS_SB(sb); if (sb_any_quota_loaded(sb) && F2FS_OPTION(sbi).s_qf_names[qtype]) { f2fs_err(sbi, "Cannot change journaled quota options when quota turned on"); return -EINVAL; } kfree(F2FS_OPTION(sbi).s_qf_names[qtype]); F2FS_OPTION(sbi).s_qf_names[qtype] = NULL; return 0; } static int f2fs_check_quota_options(struct f2fs_sb_info *sbi) { /* * We do the test below only for project quotas. 'usrquota' and * 'grpquota' mount options are allowed even without quota feature * to support legacy quotas in quota files. */ if (test_opt(sbi, PRJQUOTA) && !f2fs_sb_has_project_quota(sbi)) { f2fs_err(sbi, "Project quota feature not enabled. Cannot enable project quota enforcement."); return -1; } if (F2FS_OPTION(sbi).s_qf_names[USRQUOTA] || F2FS_OPTION(sbi).s_qf_names[GRPQUOTA] || F2FS_OPTION(sbi).s_qf_names[PRJQUOTA]) { if (test_opt(sbi, USRQUOTA) && F2FS_OPTION(sbi).s_qf_names[USRQUOTA]) clear_opt(sbi, USRQUOTA); if (test_opt(sbi, GRPQUOTA) && F2FS_OPTION(sbi).s_qf_names[GRPQUOTA]) clear_opt(sbi, GRPQUOTA); if (test_opt(sbi, PRJQUOTA) && F2FS_OPTION(sbi).s_qf_names[PRJQUOTA]) clear_opt(sbi, PRJQUOTA); if (test_opt(sbi, GRPQUOTA) || test_opt(sbi, USRQUOTA) || test_opt(sbi, PRJQUOTA)) { f2fs_err(sbi, "old and new quota format mixing"); return -1; } if (!F2FS_OPTION(sbi).s_jquota_fmt) { f2fs_err(sbi, "journaled quota format not specified"); return -1; } } if (f2fs_sb_has_quota_ino(sbi) && F2FS_OPTION(sbi).s_jquota_fmt) { f2fs_info(sbi, "QUOTA feature is enabled, so ignore jquota_fmt"); F2FS_OPTION(sbi).s_jquota_fmt = 0; } return 0; } #endif static int f2fs_set_test_dummy_encryption(struct super_block *sb, const char *opt, const substring_t *arg, bool is_remount) { struct f2fs_sb_info *sbi = F2FS_SB(sb); struct fs_parameter param = { .type = fs_value_is_string, .string = arg->from ? arg->from : "", }; struct fscrypt_dummy_policy *policy = &F2FS_OPTION(sbi).dummy_enc_policy; int err; if (!IS_ENABLED(CONFIG_FS_ENCRYPTION)) { f2fs_warn(sbi, "test_dummy_encryption option not supported"); return -EINVAL; } if (!f2fs_sb_has_encrypt(sbi)) { f2fs_err(sbi, "Encrypt feature is off"); return -EINVAL; } /* * This mount option is just for testing, and it's not worthwhile to * implement the extra complexity (e.g. RCU protection) that would be * needed to allow it to be set or changed during remount. We do allow * it to be specified during remount, but only if there is no change. */ if (is_remount && !fscrypt_is_dummy_policy_set(policy)) { f2fs_warn(sbi, "Can't set test_dummy_encryption on remount"); return -EINVAL; } err = fscrypt_parse_test_dummy_encryption(&param, policy); if (err) { if (err == -EEXIST) f2fs_warn(sbi, "Can't change test_dummy_encryption on remount"); else if (err == -EINVAL) f2fs_warn(sbi, "Value of option \"%s\" is unrecognized", opt); else f2fs_warn(sbi, "Error processing option \"%s\" [%d]", opt, err); return -EINVAL; } f2fs_warn(sbi, "Test dummy encryption mode enabled"); return 0; } #ifdef CONFIG_F2FS_FS_COMPRESSION static bool is_compress_extension_exist(struct f2fs_sb_info *sbi, const char *new_ext, bool is_ext) { unsigned char (*ext)[F2FS_EXTENSION_LEN]; int ext_cnt; int i; if (is_ext) { ext = F2FS_OPTION(sbi).extensions; ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt; } else { ext = F2FS_OPTION(sbi).noextensions; ext_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt; } for (i = 0; i < ext_cnt; i++) { if (!strcasecmp(new_ext, ext[i])) return true; } return false; } /* * 1. The same extension name cannot not appear in both compress and non-compress extension * at the same time. * 2. If the compress extension specifies all files, the types specified by the non-compress * extension will be treated as special cases and will not be compressed. * 3. Don't allow the non-compress extension specifies all files. */ static int f2fs_test_compress_extension(struct f2fs_sb_info *sbi) { unsigned char (*ext)[F2FS_EXTENSION_LEN]; unsigned char (*noext)[F2FS_EXTENSION_LEN]; int ext_cnt, noext_cnt, index = 0, no_index = 0; ext = F2FS_OPTION(sbi).extensions; ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt; noext = F2FS_OPTION(sbi).noextensions; noext_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt; if (!noext_cnt) return 0; for (no_index = 0; no_index < noext_cnt; no_index++) { if (!strcasecmp("*", noext[no_index])) { f2fs_info(sbi, "Don't allow the nocompress extension specifies all files"); return -EINVAL; } for (index = 0; index < ext_cnt; index++) { if (!strcasecmp(ext[index], noext[no_index])) { f2fs_info(sbi, "Don't allow the same extension %s appear in both compress and nocompress extension", ext[index]); return -EINVAL; } } } return 0; } #ifdef CONFIG_F2FS_FS_LZ4 static int f2fs_set_lz4hc_level(struct f2fs_sb_info *sbi, const char *str) { #ifdef CONFIG_F2FS_FS_LZ4HC unsigned int level; if (strlen(str) == 3) { F2FS_OPTION(sbi).compress_level = 0; return 0; } str += 3; if (str[0] != ':') { f2fs_info(sbi, "wrong format, e.g. <alg_name>:<compr_level>"); return -EINVAL; } if (kstrtouint(str + 1, 10, &level)) return -EINVAL; if (!f2fs_is_compress_level_valid(COMPRESS_LZ4, level)) { f2fs_info(sbi, "invalid lz4hc compress level: %d", level); return -EINVAL; } F2FS_OPTION(sbi).compress_level = level; return 0; #else if (strlen(str) == 3) { F2FS_OPTION(sbi).compress_level = 0; return 0; } f2fs_info(sbi, "kernel doesn't support lz4hc compression"); return -EINVAL; #endif } #endif #ifdef CONFIG_F2FS_FS_ZSTD static int f2fs_set_zstd_level(struct f2fs_sb_info *sbi, const char *str) { unsigned int level; int len = 4; if (strlen(str) == len) { F2FS_OPTION(sbi).compress_level = F2FS_ZSTD_DEFAULT_CLEVEL; return 0; } str += len; if (str[0] != ':') { f2fs_info(sbi, "wrong format, e.g. <alg_name>:<compr_level>"); return -EINVAL; } if (kstrtouint(str + 1, 10, &level)) return -EINVAL; if (!f2fs_is_compress_level_valid(COMPRESS_ZSTD, level)) { f2fs_info(sbi, "invalid zstd compress level: %d", level); return -EINVAL; } F2FS_OPTION(sbi).compress_level = level; return 0; } #endif #endif static int parse_options(struct super_block *sb, char *options, bool is_remount) { struct f2fs_sb_info *sbi = F2FS_SB(sb); substring_t args[MAX_OPT_ARGS]; #ifdef CONFIG_F2FS_FS_COMPRESSION unsigned char (*ext)[F2FS_EXTENSION_LEN]; unsigned char (*noext)[F2FS_EXTENSION_LEN]; int ext_cnt, noext_cnt; #endif char *p, *name; int arg = 0; kuid_t uid; kgid_t gid; int ret; if (!options) goto default_check; while ((p = strsep(&options, ",")) != NULL) { int token; if (!*p) continue; /* * Initialize args struct so we know whether arg was * found; some options take optional arguments. */ args[0].to = args[0].from = NULL; token = match_token(p, f2fs_tokens, args); switch (token) { case Opt_gc_background: name = match_strdup(&args[0]); if (!name) return -ENOMEM; if (!strcmp(name, "on")) { F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_ON; } else if (!strcmp(name, "off")) { F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_OFF; } else if (!strcmp(name, "sync")) { F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_SYNC; } else { kfree(name); return -EINVAL; } kfree(name); break; case Opt_disable_roll_forward: set_opt(sbi, DISABLE_ROLL_FORWARD); break; case Opt_norecovery: /* this option mounts f2fs with ro */ set_opt(sbi, NORECOVERY); if (!f2fs_readonly(sb)) return -EINVAL; break; case Opt_discard: if (!f2fs_hw_support_discard(sbi)) { f2fs_warn(sbi, "device does not support discard"); break; } set_opt(sbi, DISCARD); break; case Opt_nodiscard: if (f2fs_hw_should_discard(sbi)) { f2fs_warn(sbi, "discard is required for zoned block devices"); return -EINVAL; } clear_opt(sbi, DISCARD); break; case Opt_noheap: set_opt(sbi, NOHEAP); break; case Opt_heap: clear_opt(sbi, NOHEAP); break; #ifdef CONFIG_F2FS_FS_XATTR case Opt_user_xattr: set_opt(sbi, XATTR_USER); break; case Opt_nouser_xattr: clear_opt(sbi, XATTR_USER); break; case Opt_inline_xattr: set_opt(sbi, INLINE_XATTR); break; case Opt_noinline_xattr: clear_opt(sbi, INLINE_XATTR); break; case Opt_inline_xattr_size: if (args->from && match_int(args, &arg)) return -EINVAL; set_opt(sbi, INLINE_XATTR_SIZE); F2FS_OPTION(sbi).inline_xattr_size = arg; break; #else case Opt_user_xattr: f2fs_info(sbi, "user_xattr options not supported"); break; case Opt_nouser_xattr: f2fs_info(sbi, "nouser_xattr options not supported"); break; case Opt_inline_xattr: f2fs_info(sbi, "inline_xattr options not supported"); break; case Opt_noinline_xattr: f2fs_info(sbi, "noinline_xattr options not supported"); break; #endif #ifdef CONFIG_F2FS_FS_POSIX_ACL case Opt_acl: set_opt(sbi, POSIX_ACL); break; case Opt_noacl: clear_opt(sbi, POSIX_ACL); break; #else case Opt_acl: f2fs_info(sbi, "acl options not supported"); break; case Opt_noacl: f2fs_info(sbi, "noacl options not supported"); break; #endif case Opt_active_logs: if (args->from && match_int(args, &arg)) return -EINVAL; if (arg != 2 && arg != 4 && arg != NR_CURSEG_PERSIST_TYPE) return -EINVAL; F2FS_OPTION(sbi).active_logs = arg; break; case Opt_disable_ext_identify: set_opt(sbi, DISABLE_EXT_IDENTIFY); break; case Opt_inline_data: set_opt(sbi, INLINE_DATA); break; case Opt_inline_dentry: set_opt(sbi, INLINE_DENTRY); break; case Opt_noinline_dentry: clear_opt(sbi, INLINE_DENTRY); break; case Opt_flush_merge: set_opt(sbi, FLUSH_MERGE); break; case Opt_noflush_merge: clear_opt(sbi, FLUSH_MERGE); break; case Opt_nobarrier: set_opt(sbi, NOBARRIER); break; case Opt_barrier: clear_opt(sbi, NOBARRIER); break; case Opt_fastboot: set_opt(sbi, FASTBOOT); break; case Opt_extent_cache: set_opt(sbi, READ_EXTENT_CACHE); break; case Opt_noextent_cache: clear_opt(sbi, READ_EXTENT_CACHE); break; case Opt_noinline_data: clear_opt(sbi, INLINE_DATA); break; case Opt_data_flush: set_opt(sbi, DATA_FLUSH); break; case Opt_reserve_root: if (args->from && match_int(args, &arg)) return -EINVAL; if (test_opt(sbi, RESERVE_ROOT)) { f2fs_info(sbi, "Preserve previous reserve_root=%u", F2FS_OPTION(sbi).root_reserved_blocks); } else { F2FS_OPTION(sbi).root_reserved_blocks = arg; set_opt(sbi, RESERVE_ROOT); } break; case Opt_resuid: if (args->from && match_int(args, &arg)) return -EINVAL; uid = make_kuid(current_user_ns(), arg); if (!uid_valid(uid)) { f2fs_err(sbi, "Invalid uid value %d", arg); return -EINVAL; } F2FS_OPTION(sbi).s_resuid = uid; break; case Opt_resgid: if (args->from && match_int(args, &arg)) return -EINVAL; gid = make_kgid(current_user_ns(), arg); if (!gid_valid(gid)) { f2fs_err(sbi, "Invalid gid value %d", arg); return -EINVAL; } F2FS_OPTION(sbi).s_resgid = gid; break; case Opt_mode: name = match_strdup(&args[0]); if (!name) return -ENOMEM; if (!strcmp(name, "adaptive")) { F2FS_OPTION(sbi).fs_mode = FS_MODE_ADAPTIVE; } else if (!strcmp(name, "lfs")) { F2FS_OPTION(sbi).fs_mode = FS_MODE_LFS; } else if (!strcmp(name, "fragment:segment")) { F2FS_OPTION(sbi).fs_mode = FS_MODE_FRAGMENT_SEG; } else if (!strcmp(name, "fragment:block")) { F2FS_OPTION(sbi).fs_mode = FS_MODE_FRAGMENT_BLK; } else { kfree(name); return -EINVAL; } kfree(name); break; case Opt_io_size_bits: if (args->from && match_int(args, &arg)) return -EINVAL; if (arg <= 0 || arg > __ilog2_u32(BIO_MAX_VECS)) { f2fs_warn(sbi, "Not support %ld, larger than %d", BIT(arg), BIO_MAX_VECS); return -EINVAL; } F2FS_OPTION(sbi).write_io_size_bits = arg; break; #ifdef CONFIG_F2FS_FAULT_INJECTION case Opt_fault_injection: if (args->from && match_int(args, &arg)) return -EINVAL; f2fs_build_fault_attr(sbi, arg, F2FS_ALL_FAULT_TYPE); set_opt(sbi, FAULT_INJECTION); break; case Opt_fault_type: if (args->from && match_int(args, &arg)) return -EINVAL; f2fs_build_fault_attr(sbi, 0, arg); set_opt(sbi, FAULT_INJECTION); break; #else case Opt_fault_injection: f2fs_info(sbi, "fault_injection options not supported"); break; case Opt_fault_type: f2fs_info(sbi, "fault_type options not supported"); break; #endif case Opt_lazytime: sb->s_flags |= SB_LAZYTIME; break; case Opt_nolazytime: sb->s_flags &= ~SB_LAZYTIME; break; #ifdef CONFIG_QUOTA case Opt_quota: case Opt_usrquota: set_opt(sbi, USRQUOTA); break; case Opt_grpquota: set_opt(sbi, GRPQUOTA); break; case Opt_prjquota: set_opt(sbi, PRJQUOTA); break; case Opt_usrjquota: ret = f2fs_set_qf_name(sb, USRQUOTA, &args[0]); if (ret) return ret; break; case Opt_grpjquota: ret = f2fs_set_qf_name(sb, GRPQUOTA, &args[0]); if (ret) return ret; break; case Opt_prjjquota: ret = f2fs_set_qf_name(sb, PRJQUOTA, &args[0]); if (ret) return ret; break; case Opt_offusrjquota: ret = f2fs_clear_qf_name(sb, USRQUOTA); if (ret) return ret; break; case Opt_offgrpjquota: ret = f2fs_clear_qf_name(sb, GRPQUOTA); if (ret) return ret; break; case Opt_offprjjquota: ret = f2fs_clear_qf_name(sb, PRJQUOTA); if (ret) return ret; break; case Opt_jqfmt_vfsold: F2FS_OPTION(sbi).s_jquota_fmt = QFMT_VFS_OLD; break; case Opt_jqfmt_vfsv0: F2FS_OPTION(sbi).s_jquota_fmt = QFMT_VFS_V0; break; case Opt_jqfmt_vfsv1: F2FS_OPTION(sbi).s_jquota_fmt = QFMT_VFS_V1; break; case Opt_noquota: clear_opt(sbi, QUOTA); clear_opt(sbi, USRQUOTA); clear_opt(sbi, GRPQUOTA); clear_opt(sbi, PRJQUOTA); break; #else case Opt_quota: case Opt_usrquota: case Opt_grpquota: case Opt_prjquota: case Opt_usrjquota: case Opt_grpjquota: case Opt_prjjquota: case Opt_offusrjquota: case Opt_offgrpjquota: case Opt_offprjjquota: case Opt_jqfmt_vfsold: case Opt_jqfmt_vfsv0: case Opt_jqfmt_vfsv1: case Opt_noquota: f2fs_info(sbi, "quota operations not supported"); break; #endif case Opt_alloc: name = match_strdup(&args[0]); if (!name) return -ENOMEM; if (!strcmp(name, "default")) { F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT; } else if (!strcmp(name, "reuse")) { F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_REUSE; } else { kfree(name); return -EINVAL; } kfree(name); break; case Opt_fsync: name = match_strdup(&args[0]); if (!name) return -ENOMEM; if (!strcmp(name, "posix")) { F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_POSIX; } else if (!strcmp(name, "strict")) { F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_STRICT; } else if (!strcmp(name, "nobarrier")) { F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_NOBARRIER; } else { kfree(name); return -EINVAL; } kfree(name); break; case Opt_test_dummy_encryption: ret = f2fs_set_test_dummy_encryption(sb, p, &args[0], is_remount); if (ret) return ret; break; case Opt_inlinecrypt: #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT sb->s_flags |= SB_INLINECRYPT; #else f2fs_info(sbi, "inline encryption not supported"); #endif break; case Opt_checkpoint_disable_cap_perc: if (args->from && match_int(args, &arg)) return -EINVAL; if (arg < 0 || arg > 100) return -EINVAL; F2FS_OPTION(sbi).unusable_cap_perc = arg; set_opt(sbi, DISABLE_CHECKPOINT); break; case Opt_checkpoint_disable_cap: if (args->from && match_int(args, &arg)) return -EINVAL; F2FS_OPTION(sbi).unusable_cap = arg; set_opt(sbi, DISABLE_CHECKPOINT); break; case Opt_checkpoint_disable: set_opt(sbi, DISABLE_CHECKPOINT); break; case Opt_checkpoint_enable: clear_opt(sbi, DISABLE_CHECKPOINT); break; case Opt_checkpoint_merge: set_opt(sbi, MERGE_CHECKPOINT); break; case Opt_nocheckpoint_merge: clear_opt(sbi, MERGE_CHECKPOINT); break; #ifdef CONFIG_F2FS_FS_COMPRESSION case Opt_compress_algorithm: if (!f2fs_sb_has_compression(sbi)) { f2fs_info(sbi, "Image doesn't support compression"); break; } name = match_strdup(&args[0]); if (!name) return -ENOMEM; if (!strcmp(name, "lzo")) { #ifdef CONFIG_F2FS_FS_LZO F2FS_OPTION(sbi).compress_level = 0; F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZO; #else f2fs_info(sbi, "kernel doesn't support lzo compression"); #endif } else if (!strncmp(name, "lz4", 3)) { #ifdef CONFIG_F2FS_FS_LZ4 ret = f2fs_set_lz4hc_level(sbi, name); if (ret) { kfree(name); return -EINVAL; } F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZ4; #else f2fs_info(sbi, "kernel doesn't support lz4 compression"); #endif } else if (!strncmp(name, "zstd", 4)) { #ifdef CONFIG_F2FS_FS_ZSTD ret = f2fs_set_zstd_level(sbi, name); if (ret) { kfree(name); return -EINVAL; } F2FS_OPTION(sbi).compress_algorithm = COMPRESS_ZSTD; #else f2fs_info(sbi, "kernel doesn't support zstd compression"); #endif } else if (!strcmp(name, "lzo-rle")) { #ifdef CONFIG_F2FS_FS_LZORLE F2FS_OPTION(sbi).compress_level = 0; F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZORLE; #else f2fs_info(sbi, "kernel doesn't support lzorle compression"); #endif } else { kfree(name); return -EINVAL; } kfree(name); break; case Opt_compress_log_size: if (!f2fs_sb_has_compression(sbi)) { f2fs_info(sbi, "Image doesn't support compression"); break; } if (args->from && match_int(args, &arg)) return -EINVAL; if (arg < MIN_COMPRESS_LOG_SIZE || arg > MAX_COMPRESS_LOG_SIZE) { f2fs_err(sbi, "Compress cluster log size is out of range"); return -EINVAL; } F2FS_OPTION(sbi).compress_log_size = arg; break; case Opt_compress_extension: if (!f2fs_sb_has_compression(sbi)) { f2fs_info(sbi, "Image doesn't support compression"); break; } name = match_strdup(&args[0]); if (!name) return -ENOMEM; ext = F2FS_OPTION(sbi).extensions; ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt; if (strlen(name) >= F2FS_EXTENSION_LEN || ext_cnt >= COMPRESS_EXT_NUM) { f2fs_err(sbi, "invalid extension length/number"); kfree(name); return -EINVAL; } if (is_compress_extension_exist(sbi, name, true)) { kfree(name); break; } strcpy(ext[ext_cnt], name); F2FS_OPTION(sbi).compress_ext_cnt++; kfree(name); break; case Opt_nocompress_extension: if (!f2fs_sb_has_compression(sbi)) { f2fs_info(sbi, "Image doesn't support compression"); break; } name = match_strdup(&args[0]); if (!name) return -ENOMEM; noext = F2FS_OPTION(sbi).noextensions; noext_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt; if (strlen(name) >= F2FS_EXTENSION_LEN || noext_cnt >= COMPRESS_EXT_NUM) { f2fs_err(sbi, "invalid extension length/number"); kfree(name); return -EINVAL; } if (is_compress_extension_exist(sbi, name, false)) { kfree(name); break; } strcpy(noext[noext_cnt], name); F2FS_OPTION(sbi).nocompress_ext_cnt++; kfree(name); break; case Opt_compress_chksum: if (!f2fs_sb_has_compression(sbi)) { f2fs_info(sbi, "Image doesn't support compression"); break; } F2FS_OPTION(sbi).compress_chksum = true; break; case Opt_compress_mode: if (!f2fs_sb_has_compression(sbi)) { f2fs_info(sbi, "Image doesn't support compression"); break; } name = match_strdup(&args[0]); if (!name) return -ENOMEM; if (!strcmp(name, "fs")) { F2FS_OPTION(sbi).compress_mode = COMPR_MODE_FS; } else if (!strcmp(name, "user")) { F2FS_OPTION(sbi).compress_mode = COMPR_MODE_USER; } else { kfree(name); return -EINVAL; } kfree(name); break; case Opt_compress_cache: if (!f2fs_sb_has_compression(sbi)) { f2fs_info(sbi, "Image doesn't support compression"); break; } set_opt(sbi, COMPRESS_CACHE); break; #else case Opt_compress_algorithm: case Opt_compress_log_size: case Opt_compress_extension: case Opt_nocompress_extension: case Opt_compress_chksum: case Opt_compress_mode: case Opt_compress_cache: f2fs_info(sbi, "compression options not supported"); break; #endif case Opt_atgc: set_opt(sbi, ATGC); break; case Opt_gc_merge: set_opt(sbi, GC_MERGE); break; case Opt_nogc_merge: clear_opt(sbi, GC_MERGE); break; case Opt_discard_unit: name = match_strdup(&args[0]); if (!name) return -ENOMEM; if (!strcmp(name, "block")) { F2FS_OPTION(sbi).discard_unit = DISCARD_UNIT_BLOCK; } else if (!strcmp(name, "segment")) { F2FS_OPTION(sbi).discard_unit = DISCARD_UNIT_SEGMENT; } else if (!strcmp(name, "section")) { F2FS_OPTION(sbi).discard_unit = DISCARD_UNIT_SECTION; } else { kfree(name); return -EINVAL; } kfree(name); break; case Opt_memory_mode: name = match_strdup(&args[0]); if (!name) return -ENOMEM; if (!strcmp(name, "normal")) { F2FS_OPTION(sbi).memory_mode = MEMORY_MODE_NORMAL; } else if (!strcmp(name, "low")) { F2FS_OPTION(sbi).memory_mode = MEMORY_MODE_LOW; } else { kfree(name); return -EINVAL; } kfree(name); break; case Opt_age_extent_cache: set_opt(sbi, AGE_EXTENT_CACHE); break; case Opt_errors: name = match_strdup(&args[0]); if (!name) return -ENOMEM; if (!strcmp(name, "remount-ro")) { F2FS_OPTION(sbi).errors = MOUNT_ERRORS_READONLY; } else if (!strcmp(name, "continue")) { F2FS_OPTION(sbi).errors = MOUNT_ERRORS_CONTINUE; } else if (!strcmp(name, "panic")) { F2FS_OPTION(sbi).errors = MOUNT_ERRORS_PANIC; } else { kfree(name); return -EINVAL; } kfree(name); break; default: f2fs_err(sbi, "Unrecognized mount option \"%s\" or missing value", p); return -EINVAL; } } default_check: #ifdef CONFIG_QUOTA if (f2fs_check_quota_options(sbi)) return -EINVAL; #else if (f2fs_sb_has_quota_ino(sbi) && !f2fs_readonly(sbi->sb)) { f2fs_info(sbi, "Filesystem with quota feature cannot be mounted RDWR without CONFIG_QUOTA"); return -EINVAL; } if (f2fs_sb_has_project_quota(sbi) && !f2fs_readonly(sbi->sb)) { f2fs_err(sbi, "Filesystem with project quota feature cannot be mounted RDWR without CONFIG_QUOTA"); return -EINVAL; } #endif #if !IS_ENABLED(CONFIG_UNICODE) if (f2fs_sb_has_casefold(sbi)) { f2fs_err(sbi, "Filesystem with casefold feature cannot be mounted without CONFIG_UNICODE"); return -EINVAL; } #endif /* * The BLKZONED feature indicates that the drive was formatted with * zone alignment optimization. This is optional for host-aware * devices, but mandatory for host-managed zoned block devices. */ if (f2fs_sb_has_blkzoned(sbi)) { #ifdef CONFIG_BLK_DEV_ZONED if (F2FS_OPTION(sbi).discard_unit != DISCARD_UNIT_SECTION) { f2fs_info(sbi, "Zoned block device doesn't need small discard, set discard_unit=section by default"); F2FS_OPTION(sbi).discard_unit = DISCARD_UNIT_SECTION; } if (F2FS_OPTION(sbi).fs_mode != FS_MODE_LFS) { f2fs_info(sbi, "Only lfs mode is allowed with zoned block device feature"); return -EINVAL; } #else f2fs_err(sbi, "Zoned block device support is not enabled"); return -EINVAL; #endif } #ifdef CONFIG_F2FS_FS_COMPRESSION if (f2fs_test_compress_extension(sbi)) { f2fs_err(sbi, "invalid compress or nocompress extension"); return -EINVAL; } #endif if (F2FS_IO_SIZE_BITS(sbi) && !f2fs_lfs_mode(sbi)) { f2fs_err(sbi, "Should set mode=lfs with %luKB-sized IO", F2FS_IO_SIZE_KB(sbi)); return -EINVAL; } if (test_opt(sbi, INLINE_XATTR_SIZE)) { int min_size, max_size; if (!f2fs_sb_has_extra_attr(sbi) || !f2fs_sb_has_flexible_inline_xattr(sbi)) { f2fs_err(sbi, "extra_attr or flexible_inline_xattr feature is off"); return -EINVAL; } if (!test_opt(sbi, INLINE_XATTR)) { f2fs_err(sbi, "inline_xattr_size option should be set with inline_xattr option"); return -EINVAL; } min_size = MIN_INLINE_XATTR_SIZE; max_size = MAX_INLINE_XATTR_SIZE; if (F2FS_OPTION(sbi).inline_xattr_size < min_size || F2FS_OPTION(sbi).inline_xattr_size > max_size) { f2fs_err(sbi, "inline xattr size is out of range: %d ~ %d", min_size, max_size); return -EINVAL; } } if (test_opt(sbi, ATGC) && f2fs_lfs_mode(sbi)) { f2fs_err(sbi, "LFS is not compatible with ATGC"); return -EINVAL; } if (f2fs_is_readonly(sbi) && test_opt(sbi, FLUSH_MERGE)) { f2fs_err(sbi, "FLUSH_MERGE not compatible with readonly mode"); return -EINVAL; } if (f2fs_sb_has_readonly(sbi) && !f2fs_readonly(sbi->sb)) { f2fs_err(sbi, "Allow to mount readonly mode only"); return -EROFS; } return 0; } static struct inode *f2fs_alloc_inode(struct super_block *sb) { struct f2fs_inode_info *fi; if (time_to_inject(F2FS_SB(sb), FAULT_SLAB_ALLOC)) return NULL; fi = alloc_inode_sb(sb, f2fs_inode_cachep, GFP_F2FS_ZERO); if (!fi) return NULL; init_once((void *) fi); /* Initialize f2fs-specific inode info */ atomic_set(&fi->dirty_pages, 0); atomic_set(&fi->i_compr_blocks, 0); init_f2fs_rwsem(&fi->i_sem); spin_lock_init(&fi->i_size_lock); INIT_LIST_HEAD(&fi->dirty_list); INIT_LIST_HEAD(&fi->gdirty_list); init_f2fs_rwsem(&fi->i_gc_rwsem[READ]); init_f2fs_rwsem(&fi->i_gc_rwsem[WRITE]); init_f2fs_rwsem(&fi->i_xattr_sem); /* Will be used by directory only */ fi->i_dir_level = F2FS_SB(sb)->dir_level; return &fi->vfs_inode; } static int f2fs_drop_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); int ret; /* * during filesystem shutdown, if checkpoint is disabled, * drop useless meta/node dirty pages. */ if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { if (inode->i_ino == F2FS_NODE_INO(sbi) || inode->i_ino == F2FS_META_INO(sbi)) { trace_f2fs_drop_inode(inode, 1); return 1; } } /* * This is to avoid a deadlock condition like below. * writeback_single_inode(inode) * - f2fs_write_data_page * - f2fs_gc -> iput -> evict * - inode_wait_for_writeback(inode) */ if ((!inode_unhashed(inode) && inode->i_state & I_SYNC)) { if (!inode->i_nlink && !is_bad_inode(inode)) { /* to avoid evict_inode call simultaneously */ atomic_inc(&inode->i_count); spin_unlock(&inode->i_lock); /* should remain fi->extent_tree for writepage */ f2fs_destroy_extent_node(inode); sb_start_intwrite(inode->i_sb); f2fs_i_size_write(inode, 0); f2fs_submit_merged_write_cond(F2FS_I_SB(inode), inode, NULL, 0, DATA); truncate_inode_pages_final(inode->i_mapping); if (F2FS_HAS_BLOCKS(inode)) f2fs_truncate(inode); sb_end_intwrite(inode->i_sb); spin_lock(&inode->i_lock); atomic_dec(&inode->i_count); } trace_f2fs_drop_inode(inode, 0); return 0; } ret = generic_drop_inode(inode); if (!ret) ret = fscrypt_drop_inode(inode); trace_f2fs_drop_inode(inode, ret); return ret; } int f2fs_inode_dirtied(struct inode *inode, bool sync) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); int ret = 0; spin_lock(&sbi->inode_lock[DIRTY_META]); if (is_inode_flag_set(inode, FI_DIRTY_INODE)) { ret = 1; } else { set_inode_flag(inode, FI_DIRTY_INODE); stat_inc_dirty_inode(sbi, DIRTY_META); } if (sync && list_empty(&F2FS_I(inode)->gdirty_list)) { list_add_tail(&F2FS_I(inode)->gdirty_list, &sbi->inode_list[DIRTY_META]); inc_page_count(sbi, F2FS_DIRTY_IMETA); } spin_unlock(&sbi->inode_lock[DIRTY_META]); return ret; } void f2fs_inode_synced(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); spin_lock(&sbi->inode_lock[DIRTY_META]); if (!is_inode_flag_set(inode, FI_DIRTY_INODE)) { spin_unlock(&sbi->inode_lock[DIRTY_META]); return; } if (!list_empty(&F2FS_I(inode)->gdirty_list)) { list_del_init(&F2FS_I(inode)->gdirty_list); dec_page_count(sbi, F2FS_DIRTY_IMETA); } clear_inode_flag(inode, FI_DIRTY_INODE); clear_inode_flag(inode, FI_AUTO_RECOVER); stat_dec_dirty_inode(F2FS_I_SB(inode), DIRTY_META); spin_unlock(&sbi->inode_lock[DIRTY_META]); } /* * f2fs_dirty_inode() is called from __mark_inode_dirty() * * We should call set_dirty_inode to write the dirty inode through write_inode. */ static void f2fs_dirty_inode(struct inode *inode, int flags) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); if (inode->i_ino == F2FS_NODE_INO(sbi) || inode->i_ino == F2FS_META_INO(sbi)) return; if (is_inode_flag_set(inode, FI_AUTO_RECOVER)) clear_inode_flag(inode, FI_AUTO_RECOVER); f2fs_inode_dirtied(inode, false); } static void f2fs_free_inode(struct inode *inode) { fscrypt_free_inode(inode); kmem_cache_free(f2fs_inode_cachep, F2FS_I(inode)); } static void destroy_percpu_info(struct f2fs_sb_info *sbi) { percpu_counter_destroy(&sbi->total_valid_inode_count); percpu_counter_destroy(&sbi->rf_node_block_count); percpu_counter_destroy(&sbi->alloc_valid_block_count); } static void destroy_device_list(struct f2fs_sb_info *sbi) { int i; for (i = 0; i < sbi->s_ndevs; i++) { if (i > 0) bdev_release(FDEV(i).bdev_handle); #ifdef CONFIG_BLK_DEV_ZONED kvfree(FDEV(i).blkz_seq); #endif } kvfree(sbi->devs); } static void f2fs_put_super(struct super_block *sb) { struct f2fs_sb_info *sbi = F2FS_SB(sb); int i; int err = 0; bool done; /* unregister procfs/sysfs entries in advance to avoid race case */ f2fs_unregister_sysfs(sbi); f2fs_quota_off_umount(sb); /* prevent remaining shrinker jobs */ mutex_lock(&sbi->umount_mutex); /* * flush all issued checkpoints and stop checkpoint issue thread. * after then, all checkpoints should be done by each process context. */ f2fs_stop_ckpt_thread(sbi); /* * We don't need to do checkpoint when superblock is clean. * But, the previous checkpoint was not done by umount, it needs to do * clean checkpoint again. */ if ((is_sbi_flag_set(sbi, SBI_IS_DIRTY) || !is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG))) { struct cp_control cpc = { .reason = CP_UMOUNT, }; stat_inc_cp_call_count(sbi, TOTAL_CALL); err = f2fs_write_checkpoint(sbi, &cpc); } /* be sure to wait for any on-going discard commands */ done = f2fs_issue_discard_timeout(sbi); if (f2fs_realtime_discard_enable(sbi) && !sbi->discard_blks && done) { struct cp_control cpc = { .reason = CP_UMOUNT | CP_TRIMMED, }; stat_inc_cp_call_count(sbi, TOTAL_CALL); err = f2fs_write_checkpoint(sbi, &cpc); } /* * normally superblock is clean, so we need to release this. * In addition, EIO will skip do checkpoint, we need this as well. */ f2fs_release_ino_entry(sbi, true); f2fs_leave_shrinker(sbi); mutex_unlock(&sbi->umount_mutex); /* our cp_error case, we can wait for any writeback page */ f2fs_flush_merged_writes(sbi); f2fs_wait_on_all_pages(sbi, F2FS_WB_CP_DATA); if (err || f2fs_cp_error(sbi)) { truncate_inode_pages_final(NODE_MAPPING(sbi)); truncate_inode_pages_final(META_MAPPING(sbi)); } for (i = 0; i < NR_COUNT_TYPE; i++) { if (!get_pages(sbi, i)) continue; f2fs_err(sbi, "detect filesystem reference count leak during " "umount, type: %d, count: %lld", i, get_pages(sbi, i)); f2fs_bug_on(sbi, 1); } f2fs_bug_on(sbi, sbi->fsync_node_num); f2fs_destroy_compress_inode(sbi); iput(sbi->node_inode); sbi->node_inode = NULL; iput(sbi->meta_inode); sbi->meta_inode = NULL; /* * iput() can update stat information, if f2fs_write_checkpoint() * above failed with error. */ f2fs_destroy_stats(sbi); /* destroy f2fs internal modules */ f2fs_destroy_node_manager(sbi); f2fs_destroy_segment_manager(sbi); /* flush s_error_work before sbi destroy */ flush_work(&sbi->s_error_work); f2fs_destroy_post_read_wq(sbi); kvfree(sbi->ckpt); if (sbi->s_chksum_driver) crypto_free_shash(sbi->s_chksum_driver); kfree(sbi->raw_super); f2fs_destroy_page_array_cache(sbi); f2fs_destroy_xattr_caches(sbi); mempool_destroy(sbi->write_io_dummy); #ifdef CONFIG_QUOTA for (i = 0; i < MAXQUOTAS; i++) kfree(F2FS_OPTION(sbi).s_qf_names[i]); #endif fscrypt_free_dummy_policy(&F2FS_OPTION(sbi).dummy_enc_policy); destroy_percpu_info(sbi); f2fs_destroy_iostat(sbi); for (i = 0; i < NR_PAGE_TYPE; i++) kvfree(sbi->write_io[i]); #if IS_ENABLED(CONFIG_UNICODE) utf8_unload(sb->s_encoding); #endif } int f2fs_sync_fs(struct super_block *sb, int sync) { struct f2fs_sb_info *sbi = F2FS_SB(sb); int err = 0; if (unlikely(f2fs_cp_error(sbi))) return 0; if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) return 0; trace_f2fs_sync_fs(sb, sync); if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) return -EAGAIN; if (sync) { stat_inc_cp_call_count(sbi, TOTAL_CALL); err = f2fs_issue_checkpoint(sbi); } return err; } static int f2fs_freeze(struct super_block *sb) { if (f2fs_readonly(sb)) return 0; /* IO error happened before */ if (unlikely(f2fs_cp_error(F2FS_SB(sb)))) return -EIO; /* must be clean, since sync_filesystem() was already called */ if (is_sbi_flag_set(F2FS_SB(sb), SBI_IS_DIRTY)) return -EINVAL; /* Let's flush checkpoints and stop the thread. */ f2fs_flush_ckpt_thread(F2FS_SB(sb)); /* to avoid deadlock on f2fs_evict_inode->SB_FREEZE_FS */ set_sbi_flag(F2FS_SB(sb), SBI_IS_FREEZING); return 0; } static int f2fs_unfreeze(struct super_block *sb) { clear_sbi_flag(F2FS_SB(sb), SBI_IS_FREEZING); return 0; } #ifdef CONFIG_QUOTA static int f2fs_statfs_project(struct super_block *sb, kprojid_t projid, struct kstatfs *buf) { struct kqid qid; struct dquot *dquot; u64 limit; u64 curblock; qid = make_kqid_projid(projid); dquot = dqget(sb, qid); if (IS_ERR(dquot)) return PTR_ERR(dquot); spin_lock(&dquot->dq_dqb_lock); limit = min_not_zero(dquot->dq_dqb.dqb_bsoftlimit, dquot->dq_dqb.dqb_bhardlimit); if (limit) limit >>= sb->s_blocksize_bits; if (limit && buf->f_blocks > limit) { curblock = (dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace) >> sb->s_blocksize_bits; buf->f_blocks = limit; buf->f_bfree = buf->f_bavail = (buf->f_blocks > curblock) ? (buf->f_blocks - curblock) : 0; } limit = min_not_zero(dquot->dq_dqb.dqb_isoftlimit, dquot->dq_dqb.dqb_ihardlimit); if (limit && buf->f_files > limit) { buf->f_files = limit; buf->f_ffree = (buf->f_files > dquot->dq_dqb.dqb_curinodes) ? (buf->f_files - dquot->dq_dqb.dqb_curinodes) : 0; } spin_unlock(&dquot->dq_dqb_lock); dqput(dquot); return 0; } #endif static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct f2fs_sb_info *sbi = F2FS_SB(sb); u64 id = huge_encode_dev(sb->s_bdev->bd_dev); block_t total_count, user_block_count, start_count; u64 avail_node_count; unsigned int total_valid_node_count; total_count = le64_to_cpu(sbi->raw_super->block_count); start_count = le32_to_cpu(sbi->raw_super->segment0_blkaddr); buf->f_type = F2FS_SUPER_MAGIC; buf->f_bsize = sbi->blocksize; buf->f_blocks = total_count - start_count; spin_lock(&sbi->stat_lock); user_block_count = sbi->user_block_count; total_valid_node_count = valid_node_count(sbi); avail_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; buf->f_bfree = user_block_count - valid_user_blocks(sbi) - sbi->current_reserved_blocks; if (unlikely(buf->f_bfree <= sbi->unusable_block_count)) buf->f_bfree = 0; else buf->f_bfree -= sbi->unusable_block_count; spin_unlock(&sbi->stat_lock); if (buf->f_bfree > F2FS_OPTION(sbi).root_reserved_blocks) buf->f_bavail = buf->f_bfree - F2FS_OPTION(sbi).root_reserved_blocks; else buf->f_bavail = 0; if (avail_node_count > user_block_count) { buf->f_files = user_block_count; buf->f_ffree = buf->f_bavail; } else { buf->f_files = avail_node_count; buf->f_ffree = min(avail_node_count - total_valid_node_count, buf->f_bavail); } buf->f_namelen = F2FS_NAME_LEN; buf->f_fsid = u64_to_fsid(id); #ifdef CONFIG_QUOTA if (is_inode_flag_set(dentry->d_inode, FI_PROJ_INHERIT) && sb_has_quota_limits_enabled(sb, PRJQUOTA)) { f2fs_statfs_project(sb, F2FS_I(dentry->d_inode)->i_projid, buf); } #endif return 0; } static inline void f2fs_show_quota_options(struct seq_file *seq, struct super_block *sb) { #ifdef CONFIG_QUOTA struct f2fs_sb_info *sbi = F2FS_SB(sb); if (F2FS_OPTION(sbi).s_jquota_fmt) { char *fmtname = ""; switch (F2FS_OPTION(sbi).s_jquota_fmt) { case QFMT_VFS_OLD: fmtname = "vfsold"; break; case QFMT_VFS_V0: fmtname = "vfsv0"; break; case QFMT_VFS_V1: fmtname = "vfsv1"; break; } seq_printf(seq, ",jqfmt=%s", fmtname); } if (F2FS_OPTION(sbi).s_qf_names[USRQUOTA]) seq_show_option(seq, "usrjquota", F2FS_OPTION(sbi).s_qf_names[USRQUOTA]); if (F2FS_OPTION(sbi).s_qf_names[GRPQUOTA]) seq_show_option(seq, "grpjquota", F2FS_OPTION(sbi).s_qf_names[GRPQUOTA]); if (F2FS_OPTION(sbi).s_qf_names[PRJQUOTA]) seq_show_option(seq, "prjjquota", F2FS_OPTION(sbi).s_qf_names[PRJQUOTA]); #endif } #ifdef CONFIG_F2FS_FS_COMPRESSION static inline void f2fs_show_compress_options(struct seq_file *seq, struct super_block *sb) { struct f2fs_sb_info *sbi = F2FS_SB(sb); char *algtype = ""; int i; if (!f2fs_sb_has_compression(sbi)) return; switch (F2FS_OPTION(sbi).compress_algorithm) { case COMPRESS_LZO: algtype = "lzo"; break; case COMPRESS_LZ4: algtype = "lz4"; break; case COMPRESS_ZSTD: algtype = "zstd"; break; case COMPRESS_LZORLE: algtype = "lzo-rle"; break; } seq_printf(seq, ",compress_algorithm=%s", algtype); if (F2FS_OPTION(sbi).compress_level) seq_printf(seq, ":%d", F2FS_OPTION(sbi).compress_level); seq_printf(seq, ",compress_log_size=%u", F2FS_OPTION(sbi).compress_log_size); for (i = 0; i < F2FS_OPTION(sbi).compress_ext_cnt; i++) { seq_printf(seq, ",compress_extension=%s", F2FS_OPTION(sbi).extensions[i]); } for (i = 0; i < F2FS_OPTION(sbi).nocompress_ext_cnt; i++) { seq_printf(seq, ",nocompress_extension=%s", F2FS_OPTION(sbi).noextensions[i]); } if (F2FS_OPTION(sbi).compress_chksum) seq_puts(seq, ",compress_chksum"); if (F2FS_OPTION(sbi).compress_mode == COMPR_MODE_FS) seq_printf(seq, ",compress_mode=%s", "fs"); else if (F2FS_OPTION(sbi).compress_mode == COMPR_MODE_USER) seq_printf(seq, ",compress_mode=%s", "user"); if (test_opt(sbi, COMPRESS_CACHE)) seq_puts(seq, ",compress_cache"); } #endif static int f2fs_show_options(struct seq_file *seq, struct dentry *root) { struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb); if (F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_SYNC) seq_printf(seq, ",background_gc=%s", "sync"); else if (F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_ON) seq_printf(seq, ",background_gc=%s", "on"); else if (F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_OFF) seq_printf(seq, ",background_gc=%s", "off"); if (test_opt(sbi, GC_MERGE)) seq_puts(seq, ",gc_merge"); else seq_puts(seq, ",nogc_merge"); if (test_opt(sbi, DISABLE_ROLL_FORWARD)) seq_puts(seq, ",disable_roll_forward"); if (test_opt(sbi, NORECOVERY)) seq_puts(seq, ",norecovery"); if (test_opt(sbi, DISCARD)) { seq_puts(seq, ",discard"); if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_BLOCK) seq_printf(seq, ",discard_unit=%s", "block"); else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SEGMENT) seq_printf(seq, ",discard_unit=%s", "segment"); else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION) seq_printf(seq, ",discard_unit=%s", "section"); } else { seq_puts(seq, ",nodiscard"); } if (test_opt(sbi, NOHEAP)) seq_puts(seq, ",no_heap"); else seq_puts(seq, ",heap"); #ifdef CONFIG_F2FS_FS_XATTR if (test_opt(sbi, XATTR_USER)) seq_puts(seq, ",user_xattr"); else seq_puts(seq, ",nouser_xattr"); if (test_opt(sbi, INLINE_XATTR)) seq_puts(seq, ",inline_xattr"); else seq_puts(seq, ",noinline_xattr"); if (test_opt(sbi, INLINE_XATTR_SIZE)) seq_printf(seq, ",inline_xattr_size=%u", F2FS_OPTION(sbi).inline_xattr_size); #endif #ifdef CONFIG_F2FS_FS_POSIX_ACL if (test_opt(sbi, POSIX_ACL)) seq_puts(seq, ",acl"); else seq_puts(seq, ",noacl"); #endif if (test_opt(sbi, DISABLE_EXT_IDENTIFY)) seq_puts(seq, ",disable_ext_identify"); if (test_opt(sbi, INLINE_DATA)) seq_puts(seq, ",inline_data"); else seq_puts(seq, ",noinline_data"); if (test_opt(sbi, INLINE_DENTRY)) seq_puts(seq, ",inline_dentry"); else seq_puts(seq, ",noinline_dentry"); if (test_opt(sbi, FLUSH_MERGE)) seq_puts(seq, ",flush_merge"); else seq_puts(seq, ",noflush_merge"); if (test_opt(sbi, NOBARRIER)) seq_puts(seq, ",nobarrier"); else seq_puts(seq, ",barrier"); if (test_opt(sbi, FASTBOOT)) seq_puts(seq, ",fastboot"); if (test_opt(sbi, READ_EXTENT_CACHE)) seq_puts(seq, ",extent_cache"); else seq_puts(seq, ",noextent_cache"); if (test_opt(sbi, AGE_EXTENT_CACHE)) seq_puts(seq, ",age_extent_cache"); if (test_opt(sbi, DATA_FLUSH)) seq_puts(seq, ",data_flush"); seq_puts(seq, ",mode="); if (F2FS_OPTION(sbi).fs_mode == FS_MODE_ADAPTIVE) seq_puts(seq, "adaptive"); else if (F2FS_OPTION(sbi).fs_mode == FS_MODE_LFS) seq_puts(seq, "lfs"); else if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_SEG) seq_puts(seq, "fragment:segment"); else if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK) seq_puts(seq, "fragment:block"); seq_printf(seq, ",active_logs=%u", F2FS_OPTION(sbi).active_logs); if (test_opt(sbi, RESERVE_ROOT)) seq_printf(seq, ",reserve_root=%u,resuid=%u,resgid=%u", F2FS_OPTION(sbi).root_reserved_blocks, from_kuid_munged(&init_user_ns, F2FS_OPTION(sbi).s_resuid), from_kgid_munged(&init_user_ns, F2FS_OPTION(sbi).s_resgid)); if (F2FS_IO_SIZE_BITS(sbi)) seq_printf(seq, ",io_bits=%u", F2FS_OPTION(sbi).write_io_size_bits); #ifdef CONFIG_F2FS_FAULT_INJECTION if (test_opt(sbi, FAULT_INJECTION)) { seq_printf(seq, ",fault_injection=%u", F2FS_OPTION(sbi).fault_info.inject_rate); seq_printf(seq, ",fault_type=%u", F2FS_OPTION(sbi).fault_info.inject_type); } #endif #ifdef CONFIG_QUOTA if (test_opt(sbi, QUOTA)) seq_puts(seq, ",quota"); if (test_opt(sbi, USRQUOTA)) seq_puts(seq, ",usrquota"); if (test_opt(sbi, GRPQUOTA)) seq_puts(seq, ",grpquota"); if (test_opt(sbi, PRJQUOTA)) seq_puts(seq, ",prjquota"); #endif f2fs_show_quota_options(seq, sbi->sb); fscrypt_show_test_dummy_encryption(seq, ',', sbi->sb); if (sbi->sb->s_flags & SB_INLINECRYPT) seq_puts(seq, ",inlinecrypt"); if (F2FS_OPTION(sbi).alloc_mode == ALLOC_MODE_DEFAULT) seq_printf(seq, ",alloc_mode=%s", "default"); else if (F2FS_OPTION(sbi).alloc_mode == ALLOC_MODE_REUSE) seq_printf(seq, ",alloc_mode=%s", "reuse"); if (test_opt(sbi, DISABLE_CHECKPOINT)) seq_printf(seq, ",checkpoint=disable:%u", F2FS_OPTION(sbi).unusable_cap); if (test_opt(sbi, MERGE_CHECKPOINT)) seq_puts(seq, ",checkpoint_merge"); else seq_puts(seq, ",nocheckpoint_merge"); if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_POSIX) seq_printf(seq, ",fsync_mode=%s", "posix"); else if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT) seq_printf(seq, ",fsync_mode=%s", "strict"); else if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_NOBARRIER) seq_printf(seq, ",fsync_mode=%s", "nobarrier"); #ifdef CONFIG_F2FS_FS_COMPRESSION f2fs_show_compress_options(seq, sbi->sb); #endif if (test_opt(sbi, ATGC)) seq_puts(seq, ",atgc"); if (F2FS_OPTION(sbi).memory_mode == MEMORY_MODE_NORMAL) seq_printf(seq, ",memory=%s", "normal"); else if (F2FS_OPTION(sbi).memory_mode == MEMORY_MODE_LOW) seq_printf(seq, ",memory=%s", "low"); if (F2FS_OPTION(sbi).errors == MOUNT_ERRORS_READONLY) seq_printf(seq, ",errors=%s", "remount-ro"); else if (F2FS_OPTION(sbi).errors == MOUNT_ERRORS_CONTINUE) seq_printf(seq, ",errors=%s", "continue"); else if (F2FS_OPTION(sbi).errors == MOUNT_ERRORS_PANIC) seq_printf(seq, ",errors=%s", "panic"); return 0; } static void default_options(struct f2fs_sb_info *sbi, bool remount) { /* init some FS parameters */ if (!remount) { set_opt(sbi, READ_EXTENT_CACHE); clear_opt(sbi, DISABLE_CHECKPOINT); if (f2fs_hw_support_discard(sbi) || f2fs_hw_should_discard(sbi)) set_opt(sbi, DISCARD); if (f2fs_sb_has_blkzoned(sbi)) F2FS_OPTION(sbi).discard_unit = DISCARD_UNIT_SECTION; else F2FS_OPTION(sbi).discard_unit = DISCARD_UNIT_BLOCK; } if (f2fs_sb_has_readonly(sbi)) F2FS_OPTION(sbi).active_logs = NR_CURSEG_RO_TYPE; else F2FS_OPTION(sbi).active_logs = NR_CURSEG_PERSIST_TYPE; F2FS_OPTION(sbi).inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS; if (le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment_count_main) <= SMALL_VOLUME_SEGMENTS) F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_REUSE; else F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT; F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_POSIX; F2FS_OPTION(sbi).s_resuid = make_kuid(&init_user_ns, F2FS_DEF_RESUID); F2FS_OPTION(sbi).s_resgid = make_kgid(&init_user_ns, F2FS_DEF_RESGID); if (f2fs_sb_has_compression(sbi)) { F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZ4; F2FS_OPTION(sbi).compress_log_size = MIN_COMPRESS_LOG_SIZE; F2FS_OPTION(sbi).compress_ext_cnt = 0; F2FS_OPTION(sbi).compress_mode = COMPR_MODE_FS; } F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_ON; F2FS_OPTION(sbi).memory_mode = MEMORY_MODE_NORMAL; F2FS_OPTION(sbi).errors = MOUNT_ERRORS_CONTINUE; sbi->sb->s_flags &= ~SB_INLINECRYPT; set_opt(sbi, INLINE_XATTR); set_opt(sbi, INLINE_DATA); set_opt(sbi, INLINE_DENTRY); set_opt(sbi, NOHEAP); set_opt(sbi, MERGE_CHECKPOINT); F2FS_OPTION(sbi).unusable_cap = 0; sbi->sb->s_flags |= SB_LAZYTIME; if (!f2fs_is_readonly(sbi)) set_opt(sbi, FLUSH_MERGE); if (f2fs_sb_has_blkzoned(sbi)) F2FS_OPTION(sbi).fs_mode = FS_MODE_LFS; else F2FS_OPTION(sbi).fs_mode = FS_MODE_ADAPTIVE; #ifdef CONFIG_F2FS_FS_XATTR set_opt(sbi, XATTR_USER); #endif #ifdef CONFIG_F2FS_FS_POSIX_ACL set_opt(sbi, POSIX_ACL); #endif f2fs_build_fault_attr(sbi, 0, 0); } #ifdef CONFIG_QUOTA static int f2fs_enable_quotas(struct super_block *sb); #endif static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) { unsigned int s_flags = sbi->sb->s_flags; struct cp_control cpc; unsigned int gc_mode = sbi->gc_mode; int err = 0; int ret; block_t unusable; if (s_flags & SB_RDONLY) { f2fs_err(sbi, "checkpoint=disable on readonly fs"); return -EINVAL; } sbi->sb->s_flags |= SB_ACTIVE; /* check if we need more GC first */ unusable = f2fs_get_unusable_blocks(sbi); if (!f2fs_disable_cp_again(sbi, unusable)) goto skip_gc; f2fs_update_time(sbi, DISABLE_TIME); sbi->gc_mode = GC_URGENT_HIGH; while (!f2fs_time_over(sbi, DISABLE_TIME)) { struct f2fs_gc_control gc_control = { .victim_segno = NULL_SEGNO, .init_gc_type = FG_GC, .should_migrate_blocks = false, .err_gc_skipped = true, .nr_free_secs = 1 }; f2fs_down_write(&sbi->gc_lock); stat_inc_gc_call_count(sbi, FOREGROUND); err = f2fs_gc(sbi, &gc_control); if (err == -ENODATA) { err = 0; break; } if (err && err != -EAGAIN) break; } ret = sync_filesystem(sbi->sb); if (ret || err) { err = ret ? ret : err; goto restore_flag; } unusable = f2fs_get_unusable_blocks(sbi); if (f2fs_disable_cp_again(sbi, unusable)) { err = -EAGAIN; goto restore_flag; } skip_gc: f2fs_down_write(&sbi->gc_lock); cpc.reason = CP_PAUSE; set_sbi_flag(sbi, SBI_CP_DISABLED); stat_inc_cp_call_count(sbi, TOTAL_CALL); err = f2fs_write_checkpoint(sbi, &cpc); if (err) goto out_unlock; spin_lock(&sbi->stat_lock); sbi->unusable_block_count = unusable; spin_unlock(&sbi->stat_lock); out_unlock: f2fs_up_write(&sbi->gc_lock); restore_flag: sbi->gc_mode = gc_mode; sbi->sb->s_flags = s_flags; /* Restore SB_RDONLY status */ return err; } static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) { int retry = DEFAULT_RETRY_IO_COUNT; /* we should flush all the data to keep data consistency */ do { sync_inodes_sb(sbi->sb); f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); } while (get_pages(sbi, F2FS_DIRTY_DATA) && retry--); if (unlikely(retry < 0)) f2fs_warn(sbi, "checkpoint=enable has some unwritten data."); f2fs_down_write(&sbi->gc_lock); f2fs_dirty_to_prefree(sbi); clear_sbi_flag(sbi, SBI_CP_DISABLED); set_sbi_flag(sbi, SBI_IS_DIRTY); f2fs_up_write(&sbi->gc_lock); f2fs_sync_fs(sbi->sb, 1); /* Let's ensure there's no pending checkpoint anymore */ f2fs_flush_ckpt_thread(sbi); } static int f2fs_remount(struct super_block *sb, int *flags, char *data) { struct f2fs_sb_info *sbi = F2FS_SB(sb); struct f2fs_mount_info org_mount_opt; unsigned long old_sb_flags; int err; bool need_restart_gc = false, need_stop_gc = false; bool need_restart_flush = false, need_stop_flush = false; bool need_restart_discard = false, need_stop_discard = false; bool need_enable_checkpoint = false, need_disable_checkpoint = false; bool no_read_extent_cache = !test_opt(sbi, READ_EXTENT_CACHE); bool no_age_extent_cache = !test_opt(sbi, AGE_EXTENT_CACHE); bool enable_checkpoint = !test_opt(sbi, DISABLE_CHECKPOINT); bool no_io_align = !F2FS_IO_ALIGNED(sbi); bool no_atgc = !test_opt(sbi, ATGC); bool no_discard = !test_opt(sbi, DISCARD); bool no_compress_cache = !test_opt(sbi, COMPRESS_CACHE); bool block_unit_discard = f2fs_block_unit_discard(sbi); #ifdef CONFIG_QUOTA int i, j; #endif /* * Save the old mount options in case we * need to restore them. */ org_mount_opt = sbi->mount_opt; old_sb_flags = sb->s_flags; #ifdef CONFIG_QUOTA org_mount_opt.s_jquota_fmt = F2FS_OPTION(sbi).s_jquota_fmt; for (i = 0; i < MAXQUOTAS; i++) { if (F2FS_OPTION(sbi).s_qf_names[i]) { org_mount_opt.s_qf_names[i] = kstrdup(F2FS_OPTION(sbi).s_qf_names[i], GFP_KERNEL); if (!org_mount_opt.s_qf_names[i]) { for (j = 0; j < i; j++) kfree(org_mount_opt.s_qf_names[j]); return -ENOMEM; } } else { org_mount_opt.s_qf_names[i] = NULL; } } #endif /* recover superblocks we couldn't write due to previous RO mount */ if (!(*flags & SB_RDONLY) && is_sbi_flag_set(sbi, SBI_NEED_SB_WRITE)) { err = f2fs_commit_super(sbi, false); f2fs_info(sbi, "Try to recover all the superblocks, ret: %d", err); if (!err) clear_sbi_flag(sbi, SBI_NEED_SB_WRITE); } default_options(sbi, true); /* parse mount options */ err = parse_options(sb, data, true); if (err) goto restore_opts; /* flush outstanding errors before changing fs state */ flush_work(&sbi->s_error_work); /* * Previous and new state of filesystem is RO, * so skip checking GC and FLUSH_MERGE conditions. */ if (f2fs_readonly(sb) && (*flags & SB_RDONLY)) goto skip; if (f2fs_dev_is_readonly(sbi) && !(*flags & SB_RDONLY)) { err = -EROFS; goto restore_opts; } #ifdef CONFIG_QUOTA if (!f2fs_readonly(sb) && (*flags & SB_RDONLY)) { err = dquot_suspend(sb, -1); if (err < 0) goto restore_opts; } else if (f2fs_readonly(sb) && !(*flags & SB_RDONLY)) { /* dquot_resume needs RW */ sb->s_flags &= ~SB_RDONLY; if (sb_any_quota_suspended(sb)) { dquot_resume(sb, -1); } else if (f2fs_sb_has_quota_ino(sbi)) { err = f2fs_enable_quotas(sb); if (err) goto restore_opts; } } #endif if (f2fs_lfs_mode(sbi) && !IS_F2FS_IPU_DISABLE(sbi)) { err = -EINVAL; f2fs_warn(sbi, "LFS is not compatible with IPU"); goto restore_opts; } /* disallow enable atgc dynamically */ if (no_atgc == !!test_opt(sbi, ATGC)) { err = -EINVAL; f2fs_warn(sbi, "switch atgc option is not allowed"); goto restore_opts; } /* disallow enable/disable extent_cache dynamically */ if (no_read_extent_cache == !!test_opt(sbi, READ_EXTENT_CACHE)) { err = -EINVAL; f2fs_warn(sbi, "switch extent_cache option is not allowed"); goto restore_opts; } /* disallow enable/disable age extent_cache dynamically */ if (no_age_extent_cache == !!test_opt(sbi, AGE_EXTENT_CACHE)) { err = -EINVAL; f2fs_warn(sbi, "switch age_extent_cache option is not allowed"); goto restore_opts; } if (no_io_align == !!F2FS_IO_ALIGNED(sbi)) { err = -EINVAL; f2fs_warn(sbi, "switch io_bits option is not allowed"); goto restore_opts; } if (no_compress_cache == !!test_opt(sbi, COMPRESS_CACHE)) { err = -EINVAL; f2fs_warn(sbi, "switch compress_cache option is not allowed"); goto restore_opts; } if (block_unit_discard != f2fs_block_unit_discard(sbi)) { err = -EINVAL; f2fs_warn(sbi, "switch discard_unit option is not allowed"); goto restore_opts; } if ((*flags & SB_RDONLY) && test_opt(sbi, DISABLE_CHECKPOINT)) { err = -EINVAL; f2fs_warn(sbi, "disabling checkpoint not compatible with read-only"); goto restore_opts; } /* * We stop the GC thread if FS is mounted as RO * or if background_gc = off is passed in mount * option. Also sync the filesystem. */ if ((*flags & SB_RDONLY) || (F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_OFF && !test_opt(sbi, GC_MERGE))) { if (sbi->gc_thread) { f2fs_stop_gc_thread(sbi); need_restart_gc = true; } } else if (!sbi->gc_thread) { err = f2fs_start_gc_thread(sbi); if (err) goto restore_opts; need_stop_gc = true; } if (*flags & SB_RDONLY) { sync_inodes_sb(sb); set_sbi_flag(sbi, SBI_IS_DIRTY); set_sbi_flag(sbi, SBI_IS_CLOSE); f2fs_sync_fs(sb, 1); clear_sbi_flag(sbi, SBI_IS_CLOSE); } /* * We stop issue flush thread if FS is mounted as RO * or if flush_merge is not passed in mount option. */ if ((*flags & SB_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) { clear_opt(sbi, FLUSH_MERGE); f2fs_destroy_flush_cmd_control(sbi, false); need_restart_flush = true; } else { err = f2fs_create_flush_cmd_control(sbi); if (err) goto restore_gc; need_stop_flush = true; } if (no_discard == !!test_opt(sbi, DISCARD)) { if (test_opt(sbi, DISCARD)) { err = f2fs_start_discard_thread(sbi); if (err) goto restore_flush; need_stop_discard = true; } else { f2fs_stop_discard_thread(sbi); f2fs_issue_discard_timeout(sbi); need_restart_discard = true; } } if (enable_checkpoint == !!test_opt(sbi, DISABLE_CHECKPOINT)) { if (test_opt(sbi, DISABLE_CHECKPOINT)) { err = f2fs_disable_checkpoint(sbi); if (err) goto restore_discard; need_enable_checkpoint = true; } else { f2fs_enable_checkpoint(sbi); need_disable_checkpoint = true; } } /* * Place this routine at the end, since a new checkpoint would be * triggered while remount and we need to take care of it before * returning from remount. */ if ((*flags & SB_RDONLY) || test_opt(sbi, DISABLE_CHECKPOINT) || !test_opt(sbi, MERGE_CHECKPOINT)) { f2fs_stop_ckpt_thread(sbi); } else { /* Flush if the prevous checkpoint, if exists. */ f2fs_flush_ckpt_thread(sbi); err = f2fs_start_ckpt_thread(sbi); if (err) { f2fs_err(sbi, "Failed to start F2FS issue_checkpoint_thread (%d)", err); goto restore_checkpoint; } } skip: #ifdef CONFIG_QUOTA /* Release old quota file names */ for (i = 0; i < MAXQUOTAS; i++) kfree(org_mount_opt.s_qf_names[i]); #endif /* Update the POSIXACL Flag */ sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | (test_opt(sbi, POSIX_ACL) ? SB_POSIXACL : 0); limit_reserve_root(sbi); adjust_unusable_cap_perc(sbi); *flags = (*flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME); return 0; restore_checkpoint: if (need_enable_checkpoint) { f2fs_enable_checkpoint(sbi); } else if (need_disable_checkpoint) { if (f2fs_disable_checkpoint(sbi)) f2fs_warn(sbi, "checkpoint has not been disabled"); } restore_discard: if (need_restart_discard) { if (f2fs_start_discard_thread(sbi)) f2fs_warn(sbi, "discard has been stopped"); } else if (need_stop_discard) { f2fs_stop_discard_thread(sbi); } restore_flush: if (need_restart_flush) { if (f2fs_create_flush_cmd_control(sbi)) f2fs_warn(sbi, "background flush thread has stopped"); } else if (need_stop_flush) { clear_opt(sbi, FLUSH_MERGE); f2fs_destroy_flush_cmd_control(sbi, false); } restore_gc: if (need_restart_gc) { if (f2fs_start_gc_thread(sbi)) f2fs_warn(sbi, "background gc thread has stopped"); } else if (need_stop_gc) { f2fs_stop_gc_thread(sbi); } restore_opts: #ifdef CONFIG_QUOTA F2FS_OPTION(sbi).s_jquota_fmt = org_mount_opt.s_jquota_fmt; for (i = 0; i < MAXQUOTAS; i++) { kfree(F2FS_OPTION(sbi).s_qf_names[i]); F2FS_OPTION(sbi).s_qf_names[i] = org_mount_opt.s_qf_names[i]; } #endif sbi->mount_opt = org_mount_opt; sb->s_flags = old_sb_flags; return err; } #ifdef CONFIG_QUOTA static bool f2fs_need_recovery(struct f2fs_sb_info *sbi) { /* need to recovery orphan */ if (is_set_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG)) return true; /* need to recovery data */ if (test_opt(sbi, DISABLE_ROLL_FORWARD)) return false; if (test_opt(sbi, NORECOVERY)) return false; return !is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG); } static bool f2fs_recover_quota_begin(struct f2fs_sb_info *sbi) { bool readonly = f2fs_readonly(sbi->sb); if (!f2fs_need_recovery(sbi)) return false; /* it doesn't need to check f2fs_sb_has_readonly() */ if (f2fs_hw_is_readonly(sbi)) return false; if (readonly) { sbi->sb->s_flags &= ~SB_RDONLY; set_sbi_flag(sbi, SBI_IS_WRITABLE); } /* * Turn on quotas which were not enabled for read-only mounts if * filesystem has quota feature, so that they are updated correctly. */ return f2fs_enable_quota_files(sbi, readonly); } static void f2fs_recover_quota_end(struct f2fs_sb_info *sbi, bool quota_enabled) { if (quota_enabled) f2fs_quota_off_umount(sbi->sb); if (is_sbi_flag_set(sbi, SBI_IS_WRITABLE)) { clear_sbi_flag(sbi, SBI_IS_WRITABLE); sbi->sb->s_flags |= SB_RDONLY; } } /* Read data from quotafile */ static ssize_t f2fs_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off) { struct inode *inode = sb_dqopt(sb)->files[type]; struct address_space *mapping = inode->i_mapping; block_t blkidx = F2FS_BYTES_TO_BLK(off); int offset = off & (sb->s_blocksize - 1); int tocopy; size_t toread; loff_t i_size = i_size_read(inode); struct page *page; if (off > i_size) return 0; if (off + len > i_size) len = i_size - off; toread = len; while (toread > 0) { tocopy = min_t(unsigned long, sb->s_blocksize - offset, toread); repeat: page = read_cache_page_gfp(mapping, blkidx, GFP_NOFS); if (IS_ERR(page)) { if (PTR_ERR(page) == -ENOMEM) { memalloc_retry_wait(GFP_NOFS); goto repeat; } set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR); return PTR_ERR(page); } lock_page(page); if (unlikely(page->mapping != mapping)) { f2fs_put_page(page, 1); goto repeat; } if (unlikely(!PageUptodate(page))) { f2fs_put_page(page, 1); set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR); return -EIO; } memcpy_from_page(data, page, offset, tocopy); f2fs_put_page(page, 1); offset = 0; toread -= tocopy; data += tocopy; blkidx++; } return len; } /* Write to quotafile */ static ssize_t f2fs_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off) { struct inode *inode = sb_dqopt(sb)->files[type]; struct address_space *mapping = inode->i_mapping; const struct address_space_operations *a_ops = mapping->a_ops; int offset = off & (sb->s_blocksize - 1); size_t towrite = len; struct page *page; void *fsdata = NULL; int err = 0; int tocopy; while (towrite > 0) { tocopy = min_t(unsigned long, sb->s_blocksize - offset, towrite); retry: err = a_ops->write_begin(NULL, mapping, off, tocopy, &page, &fsdata); if (unlikely(err)) { if (err == -ENOMEM) { f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); goto retry; } set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR); break; } memcpy_to_page(page, offset, data, tocopy); a_ops->write_end(NULL, mapping, off, tocopy, tocopy, page, fsdata); offset = 0; towrite -= tocopy; off += tocopy; data += tocopy; cond_resched(); } if (len == towrite) return err; inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); f2fs_mark_inode_dirty_sync(inode, false); return len - towrite; } int f2fs_dquot_initialize(struct inode *inode) { if (time_to_inject(F2FS_I_SB(inode), FAULT_DQUOT_INIT)) return -ESRCH; return dquot_initialize(inode); } static struct dquot **f2fs_get_dquots(struct inode *inode) { return F2FS_I(inode)->i_dquot; } static qsize_t *f2fs_get_reserved_space(struct inode *inode) { return &F2FS_I(inode)->i_reserved_quota; } static int f2fs_quota_on_mount(struct f2fs_sb_info *sbi, int type) { if (is_set_ckpt_flags(sbi, CP_QUOTA_NEED_FSCK_FLAG)) { f2fs_err(sbi, "quota sysfile may be corrupted, skip loading it"); return 0; } return dquot_quota_on_mount(sbi->sb, F2FS_OPTION(sbi).s_qf_names[type], F2FS_OPTION(sbi).s_jquota_fmt, type); } int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly) { int enabled = 0; int i, err; if (f2fs_sb_has_quota_ino(sbi) && rdonly) { err = f2fs_enable_quotas(sbi->sb); if (err) { f2fs_err(sbi, "Cannot turn on quota_ino: %d", err); return 0; } return 1; } for (i = 0; i < MAXQUOTAS; i++) { if (F2FS_OPTION(sbi).s_qf_names[i]) { err = f2fs_quota_on_mount(sbi, i); if (!err) { enabled = 1; continue; } f2fs_err(sbi, "Cannot turn on quotas: %d on %d", err, i); } } return enabled; } static int f2fs_quota_enable(struct super_block *sb, int type, int format_id, unsigned int flags) { struct inode *qf_inode; unsigned long qf_inum; unsigned long qf_flag = F2FS_QUOTA_DEFAULT_FL; int err; BUG_ON(!f2fs_sb_has_quota_ino(F2FS_SB(sb))); qf_inum = f2fs_qf_ino(sb, type); if (!qf_inum) return -EPERM; qf_inode = f2fs_iget(sb, qf_inum); if (IS_ERR(qf_inode)) { f2fs_err(F2FS_SB(sb), "Bad quota inode %u:%lu", type, qf_inum); return PTR_ERR(qf_inode); } /* Don't account quota for quota files to avoid recursion */ inode_lock(qf_inode); qf_inode->i_flags |= S_NOQUOTA; if ((F2FS_I(qf_inode)->i_flags & qf_flag) != qf_flag) { F2FS_I(qf_inode)->i_flags |= qf_flag; f2fs_set_inode_flags(qf_inode); } inode_unlock(qf_inode); err = dquot_load_quota_inode(qf_inode, type, format_id, flags); iput(qf_inode); return err; } static int f2fs_enable_quotas(struct super_block *sb) { struct f2fs_sb_info *sbi = F2FS_SB(sb); int type, err = 0; unsigned long qf_inum; bool quota_mopt[MAXQUOTAS] = { test_opt(sbi, USRQUOTA), test_opt(sbi, GRPQUOTA), test_opt(sbi, PRJQUOTA), }; if (is_set_ckpt_flags(F2FS_SB(sb), CP_QUOTA_NEED_FSCK_FLAG)) { f2fs_err(sbi, "quota file may be corrupted, skip loading it"); return 0; } sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE; for (type = 0; type < MAXQUOTAS; type++) { qf_inum = f2fs_qf_ino(sb, type); if (qf_inum) { err = f2fs_quota_enable(sb, type, QFMT_VFS_V1, DQUOT_USAGE_ENABLED | (quota_mopt[type] ? DQUOT_LIMITS_ENABLED : 0)); if (err) { f2fs_err(sbi, "Failed to enable quota tracking (type=%d, err=%d). Please run fsck to fix.", type, err); for (type--; type >= 0; type--) dquot_quota_off(sb, type); set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR); return err; } } } return 0; } static int f2fs_quota_sync_file(struct f2fs_sb_info *sbi, int type) { struct quota_info *dqopt = sb_dqopt(sbi->sb); struct address_space *mapping = dqopt->files[type]->i_mapping; int ret = 0; ret = dquot_writeback_dquots(sbi->sb, type); if (ret) goto out; ret = filemap_fdatawrite(mapping); if (ret) goto out; /* if we are using journalled quota */ if (is_journalled_quota(sbi)) goto out; ret = filemap_fdatawait(mapping); truncate_inode_pages(&dqopt->files[type]->i_data, 0); out: if (ret) set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); return ret; } int f2fs_quota_sync(struct super_block *sb, int type) { struct f2fs_sb_info *sbi = F2FS_SB(sb); struct quota_info *dqopt = sb_dqopt(sb); int cnt; int ret = 0; /* * Now when everything is written we can discard the pagecache so * that userspace sees the changes. */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (type != -1 && cnt != type) continue; if (!sb_has_quota_active(sb, cnt)) continue; if (!f2fs_sb_has_quota_ino(sbi)) inode_lock(dqopt->files[cnt]); /* * do_quotactl * f2fs_quota_sync * f2fs_down_read(quota_sem) * dquot_writeback_dquots() * f2fs_dquot_commit * block_operation * f2fs_down_read(quota_sem) */ f2fs_lock_op(sbi); f2fs_down_read(&sbi->quota_sem); ret = f2fs_quota_sync_file(sbi, cnt); f2fs_up_read(&sbi->quota_sem); f2fs_unlock_op(sbi); if (!f2fs_sb_has_quota_ino(sbi)) inode_unlock(dqopt->files[cnt]); if (ret) break; } return ret; } static int f2fs_quota_on(struct super_block *sb, int type, int format_id, const struct path *path) { struct inode *inode; int err; /* if quota sysfile exists, deny enabling quota with specific file */ if (f2fs_sb_has_quota_ino(F2FS_SB(sb))) { f2fs_err(F2FS_SB(sb), "quota sysfile already exists"); return -EBUSY; } if (path->dentry->d_sb != sb) return -EXDEV; err = f2fs_quota_sync(sb, type); if (err) return err; inode = d_inode(path->dentry); err = filemap_fdatawrite(inode->i_mapping); if (err) return err; err = filemap_fdatawait(inode->i_mapping); if (err) return err; err = dquot_quota_on(sb, type, format_id, path); if (err) return err; inode_lock(inode); F2FS_I(inode)->i_flags |= F2FS_QUOTA_DEFAULT_FL; f2fs_set_inode_flags(inode); inode_unlock(inode); f2fs_mark_inode_dirty_sync(inode, false); return 0; } static int __f2fs_quota_off(struct super_block *sb, int type) { struct inode *inode = sb_dqopt(sb)->files[type]; int err; if (!inode || !igrab(inode)) return dquot_quota_off(sb, type); err = f2fs_quota_sync(sb, type); if (err) goto out_put; err = dquot_quota_off(sb, type); if (err || f2fs_sb_has_quota_ino(F2FS_SB(sb))) goto out_put; inode_lock(inode); F2FS_I(inode)->i_flags &= ~F2FS_QUOTA_DEFAULT_FL; f2fs_set_inode_flags(inode); inode_unlock(inode); f2fs_mark_inode_dirty_sync(inode, false); out_put: iput(inode); return err; } static int f2fs_quota_off(struct super_block *sb, int type) { struct f2fs_sb_info *sbi = F2FS_SB(sb); int err; err = __f2fs_quota_off(sb, type); /* * quotactl can shutdown journalled quota, result in inconsistence * between quota record and fs data by following updates, tag the * flag to let fsck be aware of it. */ if (is_journalled_quota(sbi)) set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); return err; } void f2fs_quota_off_umount(struct super_block *sb) { int type; int err; for (type = 0; type < MAXQUOTAS; type++) { err = __f2fs_quota_off(sb, type); if (err) { int ret = dquot_quota_off(sb, type); f2fs_err(F2FS_SB(sb), "Fail to turn off disk quota (type: %d, err: %d, ret:%d), Please run fsck to fix it.", type, err, ret); set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR); } } /* * In case of checkpoint=disable, we must flush quota blocks. * This can cause NULL exception for node_inode in end_io, since * put_super already dropped it. */ sync_filesystem(sb); } static void f2fs_truncate_quota_inode_pages(struct super_block *sb) { struct quota_info *dqopt = sb_dqopt(sb); int type; for (type = 0; type < MAXQUOTAS; type++) { if (!dqopt->files[type]) continue; f2fs_inode_synced(dqopt->files[type]); } } static int f2fs_dquot_commit(struct dquot *dquot) { struct f2fs_sb_info *sbi = F2FS_SB(dquot->dq_sb); int ret; f2fs_down_read_nested(&sbi->quota_sem, SINGLE_DEPTH_NESTING); ret = dquot_commit(dquot); if (ret < 0) set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); f2fs_up_read(&sbi->quota_sem); return ret; } static int f2fs_dquot_acquire(struct dquot *dquot) { struct f2fs_sb_info *sbi = F2FS_SB(dquot->dq_sb); int ret; f2fs_down_read(&sbi->quota_sem); ret = dquot_acquire(dquot); if (ret < 0) set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); f2fs_up_read(&sbi->quota_sem); return ret; } static int f2fs_dquot_release(struct dquot *dquot) { struct f2fs_sb_info *sbi = F2FS_SB(dquot->dq_sb); int ret = dquot_release(dquot); if (ret < 0) set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); return ret; } static int f2fs_dquot_mark_dquot_dirty(struct dquot *dquot) { struct super_block *sb = dquot->dq_sb; struct f2fs_sb_info *sbi = F2FS_SB(sb); int ret = dquot_mark_dquot_dirty(dquot); /* if we are using journalled quota */ if (is_journalled_quota(sbi)) set_sbi_flag(sbi, SBI_QUOTA_NEED_FLUSH); return ret; } static int f2fs_dquot_commit_info(struct super_block *sb, int type) { struct f2fs_sb_info *sbi = F2FS_SB(sb); int ret = dquot_commit_info(sb, type); if (ret < 0) set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); return ret; } static int f2fs_get_projid(struct inode *inode, kprojid_t *projid) { *projid = F2FS_I(inode)->i_projid; return 0; } static const struct dquot_operations f2fs_quota_operations = { .get_reserved_space = f2fs_get_reserved_space, .write_dquot = f2fs_dquot_commit, .acquire_dquot = f2fs_dquot_acquire, .release_dquot = f2fs_dquot_release, .mark_dirty = f2fs_dquot_mark_dquot_dirty, .write_info = f2fs_dquot_commit_info, .alloc_dquot = dquot_alloc, .destroy_dquot = dquot_destroy, .get_projid = f2fs_get_projid, .get_next_id = dquot_get_next_id, }; static const struct quotactl_ops f2fs_quotactl_ops = { .quota_on = f2fs_quota_on, .quota_off = f2fs_quota_off, .quota_sync = f2fs_quota_sync, .get_state = dquot_get_state, .set_info = dquot_set_dqinfo, .get_dqblk = dquot_get_dqblk, .set_dqblk = dquot_set_dqblk, .get_nextdqblk = dquot_get_next_dqblk, }; #else int f2fs_dquot_initialize(struct inode *inode) { return 0; } int f2fs_quota_sync(struct super_block *sb, int type) { return 0; } void f2fs_quota_off_umount(struct super_block *sb) { } #endif static const struct super_operations f2fs_sops = { .alloc_inode = f2fs_alloc_inode, .free_inode = f2fs_free_inode, .drop_inode = f2fs_drop_inode, .write_inode = f2fs_write_inode, .dirty_inode = f2fs_dirty_inode, .show_options = f2fs_show_options, #ifdef CONFIG_QUOTA .quota_read = f2fs_quota_read, .quota_write = f2fs_quota_write, .get_dquots = f2fs_get_dquots, #endif .evict_inode = f2fs_evict_inode, .put_super = f2fs_put_super, .sync_fs = f2fs_sync_fs, .freeze_fs = f2fs_freeze, .unfreeze_fs = f2fs_unfreeze, .statfs = f2fs_statfs, .remount_fs = f2fs_remount, }; #ifdef CONFIG_FS_ENCRYPTION static int f2fs_get_context(struct inode *inode, void *ctx, size_t len) { return f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION, F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, ctx, len, NULL); } static int f2fs_set_context(struct inode *inode, const void *ctx, size_t len, void *fs_data) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); /* * Encrypting the root directory is not allowed because fsck * expects lost+found directory to exist and remain unencrypted * if LOST_FOUND feature is enabled. * */ if (f2fs_sb_has_lost_found(sbi) && inode->i_ino == F2FS_ROOT_INO(sbi)) return -EPERM; return f2fs_setxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION, F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, ctx, len, fs_data, XATTR_CREATE); } static const union fscrypt_policy *f2fs_get_dummy_policy(struct super_block *sb) { return F2FS_OPTION(F2FS_SB(sb)).dummy_enc_policy.policy; } static bool f2fs_has_stable_inodes(struct super_block *sb) { return true; } static struct block_device **f2fs_get_devices(struct super_block *sb, unsigned int *num_devs) { struct f2fs_sb_info *sbi = F2FS_SB(sb); struct block_device **devs; int i; if (!f2fs_is_multi_device(sbi)) return NULL; devs = kmalloc_array(sbi->s_ndevs, sizeof(*devs), GFP_KERNEL); if (!devs) return ERR_PTR(-ENOMEM); for (i = 0; i < sbi->s_ndevs; i++) devs[i] = FDEV(i).bdev; *num_devs = sbi->s_ndevs; return devs; } static const struct fscrypt_operations f2fs_cryptops = { .needs_bounce_pages = 1, .has_32bit_inodes = 1, .supports_subblock_data_units = 1, .legacy_key_prefix = "f2fs:", .get_context = f2fs_get_context, .set_context = f2fs_set_context, .get_dummy_policy = f2fs_get_dummy_policy, .empty_dir = f2fs_empty_dir, .has_stable_inodes = f2fs_has_stable_inodes, .get_devices = f2fs_get_devices, }; #endif static struct inode *f2fs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) { struct f2fs_sb_info *sbi = F2FS_SB(sb); struct inode *inode; if (f2fs_check_nid_range(sbi, ino)) return ERR_PTR(-ESTALE); /* * f2fs_iget isn't quite right if the inode is currently unallocated! * However f2fs_iget currently does appropriate checks to handle stale * inodes so everything is OK. */ inode = f2fs_iget(sb, ino); if (IS_ERR(inode)) return ERR_CAST(inode); if (unlikely(generation && inode->i_generation != generation)) { /* we didn't find the right inode.. */ iput(inode); return ERR_PTR(-ESTALE); } return inode; } static struct dentry *f2fs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { return generic_fh_to_dentry(sb, fid, fh_len, fh_type, f2fs_nfs_get_inode); } static struct dentry *f2fs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { return generic_fh_to_parent(sb, fid, fh_len, fh_type, f2fs_nfs_get_inode); } static const struct export_operations f2fs_export_ops = { .encode_fh = generic_encode_ino32_fh, .fh_to_dentry = f2fs_fh_to_dentry, .fh_to_parent = f2fs_fh_to_parent, .get_parent = f2fs_get_parent, }; loff_t max_file_blocks(struct inode *inode) { loff_t result = 0; loff_t leaf_count; /* * note: previously, result is equal to (DEF_ADDRS_PER_INODE - * DEFAULT_INLINE_XATTR_ADDRS), but now f2fs try to reserve more * space in inode.i_addr, it will be more safe to reassign * result as zero. */ if (inode && f2fs_compressed_file(inode)) leaf_count = ADDRS_PER_BLOCK(inode); else leaf_count = DEF_ADDRS_PER_BLOCK; /* two direct node blocks */ result += (leaf_count * 2); /* two indirect node blocks */ leaf_count *= NIDS_PER_BLOCK; result += (leaf_count * 2); /* one double indirect node block */ leaf_count *= NIDS_PER_BLOCK; result += leaf_count; /* * For compatibility with FSCRYPT_POLICY_FLAG_IV_INO_LBLK_{64,32} with * a 4K crypto data unit, we must restrict the max filesize to what can * fit within U32_MAX + 1 data units. */ result = min(result, (((loff_t)U32_MAX + 1) * 4096) >> F2FS_BLKSIZE_BITS); return result; } static int __f2fs_commit_super(struct buffer_head *bh, struct f2fs_super_block *super) { lock_buffer(bh); if (super) memcpy(bh->b_data + F2FS_SUPER_OFFSET, super, sizeof(*super)); set_buffer_dirty(bh); unlock_buffer(bh); /* it's rare case, we can do fua all the time */ return __sync_dirty_buffer(bh, REQ_SYNC | REQ_PREFLUSH | REQ_FUA); } static inline bool sanity_check_area_boundary(struct f2fs_sb_info *sbi, struct buffer_head *bh) { struct f2fs_super_block *raw_super = (struct f2fs_super_block *) (bh->b_data + F2FS_SUPER_OFFSET); struct super_block *sb = sbi->sb; u32 segment0_blkaddr = le32_to_cpu(raw_super->segment0_blkaddr); u32 cp_blkaddr = le32_to_cpu(raw_super->cp_blkaddr); u32 sit_blkaddr = le32_to_cpu(raw_super->sit_blkaddr); u32 nat_blkaddr = le32_to_cpu(raw_super->nat_blkaddr); u32 ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr); u32 main_blkaddr = le32_to_cpu(raw_super->main_blkaddr); u32 segment_count_ckpt = le32_to_cpu(raw_super->segment_count_ckpt); u32 segment_count_sit = le32_to_cpu(raw_super->segment_count_sit); u32 segment_count_nat = le32_to_cpu(raw_super->segment_count_nat); u32 segment_count_ssa = le32_to_cpu(raw_super->segment_count_ssa); u32 segment_count_main = le32_to_cpu(raw_super->segment_count_main); u32 segment_count = le32_to_cpu(raw_super->segment_count); u32 log_blocks_per_seg = le32_to_cpu(raw_super->log_blocks_per_seg); u64 main_end_blkaddr = main_blkaddr + (segment_count_main << log_blocks_per_seg); u64 seg_end_blkaddr = segment0_blkaddr + (segment_count << log_blocks_per_seg); if (segment0_blkaddr != cp_blkaddr) { f2fs_info(sbi, "Mismatch start address, segment0(%u) cp_blkaddr(%u)", segment0_blkaddr, cp_blkaddr); return true; } if (cp_blkaddr + (segment_count_ckpt << log_blocks_per_seg) != sit_blkaddr) { f2fs_info(sbi, "Wrong CP boundary, start(%u) end(%u) blocks(%u)", cp_blkaddr, sit_blkaddr, segment_count_ckpt << log_blocks_per_seg); return true; } if (sit_blkaddr + (segment_count_sit << log_blocks_per_seg) != nat_blkaddr) { f2fs_info(sbi, "Wrong SIT boundary, start(%u) end(%u) blocks(%u)", sit_blkaddr, nat_blkaddr, segment_count_sit << log_blocks_per_seg); return true; } if (nat_blkaddr + (segment_count_nat << log_blocks_per_seg) != ssa_blkaddr) { f2fs_info(sbi, "Wrong NAT boundary, start(%u) end(%u) blocks(%u)", nat_blkaddr, ssa_blkaddr, segment_count_nat << log_blocks_per_seg); return true; } if (ssa_blkaddr + (segment_count_ssa << log_blocks_per_seg) != main_blkaddr) { f2fs_info(sbi, "Wrong SSA boundary, start(%u) end(%u) blocks(%u)", ssa_blkaddr, main_blkaddr, segment_count_ssa << log_blocks_per_seg); return true; } if (main_end_blkaddr > seg_end_blkaddr) { f2fs_info(sbi, "Wrong MAIN_AREA boundary, start(%u) end(%llu) block(%u)", main_blkaddr, seg_end_blkaddr, segment_count_main << log_blocks_per_seg); return true; } else if (main_end_blkaddr < seg_end_blkaddr) { int err = 0; char *res; /* fix in-memory information all the time */ raw_super->segment_count = cpu_to_le32((main_end_blkaddr - segment0_blkaddr) >> log_blocks_per_seg); if (f2fs_readonly(sb) || f2fs_hw_is_readonly(sbi)) { set_sbi_flag(sbi, SBI_NEED_SB_WRITE); res = "internally"; } else { err = __f2fs_commit_super(bh, NULL); res = err ? "failed" : "done"; } f2fs_info(sbi, "Fix alignment : %s, start(%u) end(%llu) block(%u)", res, main_blkaddr, seg_end_blkaddr, segment_count_main << log_blocks_per_seg); if (err) return true; } return false; } static int sanity_check_raw_super(struct f2fs_sb_info *sbi, struct buffer_head *bh) { block_t segment_count, segs_per_sec, secs_per_zone, segment_count_main; block_t total_sections, blocks_per_seg; struct f2fs_super_block *raw_super = (struct f2fs_super_block *) (bh->b_data + F2FS_SUPER_OFFSET); size_t crc_offset = 0; __u32 crc = 0; if (le32_to_cpu(raw_super->magic) != F2FS_SUPER_MAGIC) { f2fs_info(sbi, "Magic Mismatch, valid(0x%x) - read(0x%x)", F2FS_SUPER_MAGIC, le32_to_cpu(raw_super->magic)); return -EINVAL; } /* Check checksum_offset and crc in superblock */ if (__F2FS_HAS_FEATURE(raw_super, F2FS_FEATURE_SB_CHKSUM)) { crc_offset = le32_to_cpu(raw_super->checksum_offset); if (crc_offset != offsetof(struct f2fs_super_block, crc)) { f2fs_info(sbi, "Invalid SB checksum offset: %zu", crc_offset); return -EFSCORRUPTED; } crc = le32_to_cpu(raw_super->crc); if (!f2fs_crc_valid(sbi, crc, raw_super, crc_offset)) { f2fs_info(sbi, "Invalid SB checksum value: %u", crc); return -EFSCORRUPTED; } } /* Currently, support only 4KB block size */ if (le32_to_cpu(raw_super->log_blocksize) != F2FS_BLKSIZE_BITS) { f2fs_info(sbi, "Invalid log_blocksize (%u), supports only %u", le32_to_cpu(raw_super->log_blocksize), F2FS_BLKSIZE_BITS); return -EFSCORRUPTED; } /* check log blocks per segment */ if (le32_to_cpu(raw_super->log_blocks_per_seg) != 9) { f2fs_info(sbi, "Invalid log blocks per segment (%u)", le32_to_cpu(raw_super->log_blocks_per_seg)); return -EFSCORRUPTED; } /* Currently, support 512/1024/2048/4096/16K bytes sector size */ if (le32_to_cpu(raw_super->log_sectorsize) > F2FS_MAX_LOG_SECTOR_SIZE || le32_to_cpu(raw_super->log_sectorsize) < F2FS_MIN_LOG_SECTOR_SIZE) { f2fs_info(sbi, "Invalid log sectorsize (%u)", le32_to_cpu(raw_super->log_sectorsize)); return -EFSCORRUPTED; } if (le32_to_cpu(raw_super->log_sectors_per_block) + le32_to_cpu(raw_super->log_sectorsize) != F2FS_MAX_LOG_SECTOR_SIZE) { f2fs_info(sbi, "Invalid log sectors per block(%u) log sectorsize(%u)", le32_to_cpu(raw_super->log_sectors_per_block), le32_to_cpu(raw_super->log_sectorsize)); return -EFSCORRUPTED; } segment_count = le32_to_cpu(raw_super->segment_count); segment_count_main = le32_to_cpu(raw_super->segment_count_main); segs_per_sec = le32_to_cpu(raw_super->segs_per_sec); secs_per_zone = le32_to_cpu(raw_super->secs_per_zone); total_sections = le32_to_cpu(raw_super->section_count); /* blocks_per_seg should be 512, given the above check */ blocks_per_seg = BIT(le32_to_cpu(raw_super->log_blocks_per_seg)); if (segment_count > F2FS_MAX_SEGMENT || segment_count < F2FS_MIN_SEGMENTS) { f2fs_info(sbi, "Invalid segment count (%u)", segment_count); return -EFSCORRUPTED; } if (total_sections > segment_count_main || total_sections < 1 || segs_per_sec > segment_count || !segs_per_sec) { f2fs_info(sbi, "Invalid segment/section count (%u, %u x %u)", segment_count, total_sections, segs_per_sec); return -EFSCORRUPTED; } if (segment_count_main != total_sections * segs_per_sec) { f2fs_info(sbi, "Invalid segment/section count (%u != %u * %u)", segment_count_main, total_sections, segs_per_sec); return -EFSCORRUPTED; } if ((segment_count / segs_per_sec) < total_sections) { f2fs_info(sbi, "Small segment_count (%u < %u * %u)", segment_count, segs_per_sec, total_sections); return -EFSCORRUPTED; } if (segment_count > (le64_to_cpu(raw_super->block_count) >> 9)) { f2fs_info(sbi, "Wrong segment_count / block_count (%u > %llu)", segment_count, le64_to_cpu(raw_super->block_count)); return -EFSCORRUPTED; } if (RDEV(0).path[0]) { block_t dev_seg_count = le32_to_cpu(RDEV(0).total_segments); int i = 1; while (i < MAX_DEVICES && RDEV(i).path[0]) { dev_seg_count += le32_to_cpu(RDEV(i).total_segments); i++; } if (segment_count != dev_seg_count) { f2fs_info(sbi, "Segment count (%u) mismatch with total segments from devices (%u)", segment_count, dev_seg_count); return -EFSCORRUPTED; } } else { if (__F2FS_HAS_FEATURE(raw_super, F2FS_FEATURE_BLKZONED) && !bdev_is_zoned(sbi->sb->s_bdev)) { f2fs_info(sbi, "Zoned block device path is missing"); return -EFSCORRUPTED; } } if (secs_per_zone > total_sections || !secs_per_zone) { f2fs_info(sbi, "Wrong secs_per_zone / total_sections (%u, %u)", secs_per_zone, total_sections); return -EFSCORRUPTED; } if (le32_to_cpu(raw_super->extension_count) > F2FS_MAX_EXTENSION || raw_super->hot_ext_count > F2FS_MAX_EXTENSION || (le32_to_cpu(raw_super->extension_count) + raw_super->hot_ext_count) > F2FS_MAX_EXTENSION) { f2fs_info(sbi, "Corrupted extension count (%u + %u > %u)", le32_to_cpu(raw_super->extension_count), raw_super->hot_ext_count, F2FS_MAX_EXTENSION); return -EFSCORRUPTED; } if (le32_to_cpu(raw_super->cp_payload) >= (blocks_per_seg - F2FS_CP_PACKS - NR_CURSEG_PERSIST_TYPE)) { f2fs_info(sbi, "Insane cp_payload (%u >= %u)", le32_to_cpu(raw_super->cp_payload), blocks_per_seg - F2FS_CP_PACKS - NR_CURSEG_PERSIST_TYPE); return -EFSCORRUPTED; } /* check reserved ino info */ if (le32_to_cpu(raw_super->node_ino) != 1 || le32_to_cpu(raw_super->meta_ino) != 2 || le32_to_cpu(raw_super->root_ino) != 3) { f2fs_info(sbi, "Invalid Fs Meta Ino: node(%u) meta(%u) root(%u)", le32_to_cpu(raw_super->node_ino), le32_to_cpu(raw_super->meta_ino), le32_to_cpu(raw_super->root_ino)); return -EFSCORRUPTED; } /* check CP/SIT/NAT/SSA/MAIN_AREA area boundary */ if (sanity_check_area_boundary(sbi, bh)) return -EFSCORRUPTED; return 0; } int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) { unsigned int total, fsmeta; struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); unsigned int ovp_segments, reserved_segments; unsigned int main_segs, blocks_per_seg; unsigned int sit_segs, nat_segs; unsigned int sit_bitmap_size, nat_bitmap_size; unsigned int log_blocks_per_seg; unsigned int segment_count_main; unsigned int cp_pack_start_sum, cp_payload; block_t user_block_count, valid_user_blocks; block_t avail_node_count, valid_node_count; unsigned int nat_blocks, nat_bits_bytes, nat_bits_blocks; int i, j; total = le32_to_cpu(raw_super->segment_count); fsmeta = le32_to_cpu(raw_super->segment_count_ckpt); sit_segs = le32_to_cpu(raw_super->segment_count_sit); fsmeta += sit_segs; nat_segs = le32_to_cpu(raw_super->segment_count_nat); fsmeta += nat_segs; fsmeta += le32_to_cpu(ckpt->rsvd_segment_count); fsmeta += le32_to_cpu(raw_super->segment_count_ssa); if (unlikely(fsmeta >= total)) return 1; ovp_segments = le32_to_cpu(ckpt->overprov_segment_count); reserved_segments = le32_to_cpu(ckpt->rsvd_segment_count); if (!f2fs_sb_has_readonly(sbi) && unlikely(fsmeta < F2FS_MIN_META_SEGMENTS || ovp_segments == 0 || reserved_segments == 0)) { f2fs_err(sbi, "Wrong layout: check mkfs.f2fs version"); return 1; } user_block_count = le64_to_cpu(ckpt->user_block_count); segment_count_main = le32_to_cpu(raw_super->segment_count_main) + (f2fs_sb_has_readonly(sbi) ? 1 : 0); log_blocks_per_seg = le32_to_cpu(raw_super->log_blocks_per_seg); if (!user_block_count || user_block_count >= segment_count_main << log_blocks_per_seg) { f2fs_err(sbi, "Wrong user_block_count: %u", user_block_count); return 1; } valid_user_blocks = le64_to_cpu(ckpt->valid_block_count); if (valid_user_blocks > user_block_count) { f2fs_err(sbi, "Wrong valid_user_blocks: %u, user_block_count: %u", valid_user_blocks, user_block_count); return 1; } valid_node_count = le32_to_cpu(ckpt->valid_node_count); avail_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; if (valid_node_count > avail_node_count) { f2fs_err(sbi, "Wrong valid_node_count: %u, avail_node_count: %u", valid_node_count, avail_node_count); return 1; } main_segs = le32_to_cpu(raw_super->segment_count_main); blocks_per_seg = sbi->blocks_per_seg; for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) { if (le32_to_cpu(ckpt->cur_node_segno[i]) >= main_segs || le16_to_cpu(ckpt->cur_node_blkoff[i]) >= blocks_per_seg) return 1; if (f2fs_sb_has_readonly(sbi)) goto check_data; for (j = i + 1; j < NR_CURSEG_NODE_TYPE; j++) { if (le32_to_cpu(ckpt->cur_node_segno[i]) == le32_to_cpu(ckpt->cur_node_segno[j])) { f2fs_err(sbi, "Node segment (%u, %u) has the same segno: %u", i, j, le32_to_cpu(ckpt->cur_node_segno[i])); return 1; } } } check_data: for (i = 0; i < NR_CURSEG_DATA_TYPE; i++) { if (le32_to_cpu(ckpt->cur_data_segno[i]) >= main_segs || le16_to_cpu(ckpt->cur_data_blkoff[i]) >= blocks_per_seg) return 1; if (f2fs_sb_has_readonly(sbi)) goto skip_cross; for (j = i + 1; j < NR_CURSEG_DATA_TYPE; j++) { if (le32_to_cpu(ckpt->cur_data_segno[i]) == le32_to_cpu(ckpt->cur_data_segno[j])) { f2fs_err(sbi, "Data segment (%u, %u) has the same segno: %u", i, j, le32_to_cpu(ckpt->cur_data_segno[i])); return 1; } } } for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) { for (j = 0; j < NR_CURSEG_DATA_TYPE; j++) { if (le32_to_cpu(ckpt->cur_node_segno[i]) == le32_to_cpu(ckpt->cur_data_segno[j])) { f2fs_err(sbi, "Node segment (%u) and Data segment (%u) has the same segno: %u", i, j, le32_to_cpu(ckpt->cur_node_segno[i])); return 1; } } } skip_cross: sit_bitmap_size = le32_to_cpu(ckpt->sit_ver_bitmap_bytesize); nat_bitmap_size = le32_to_cpu(ckpt->nat_ver_bitmap_bytesize); if (sit_bitmap_size != ((sit_segs / 2) << log_blocks_per_seg) / 8 || nat_bitmap_size != ((nat_segs / 2) << log_blocks_per_seg) / 8) { f2fs_err(sbi, "Wrong bitmap size: sit: %u, nat:%u", sit_bitmap_size, nat_bitmap_size); return 1; } cp_pack_start_sum = __start_sum_addr(sbi); cp_payload = __cp_payload(sbi); if (cp_pack_start_sum < cp_payload + 1 || cp_pack_start_sum > blocks_per_seg - 1 - NR_CURSEG_PERSIST_TYPE) { f2fs_err(sbi, "Wrong cp_pack_start_sum: %u", cp_pack_start_sum); return 1; } if (__is_set_ckpt_flags(ckpt, CP_LARGE_NAT_BITMAP_FLAG) && le32_to_cpu(ckpt->checksum_offset) != CP_MIN_CHKSUM_OFFSET) { f2fs_warn(sbi, "using deprecated layout of large_nat_bitmap, " "please run fsck v1.13.0 or higher to repair, chksum_offset: %u, " "fixed with patch: \"f2fs-tools: relocate chksum_offset for large_nat_bitmap feature\"", le32_to_cpu(ckpt->checksum_offset)); return 1; } nat_blocks = nat_segs << log_blocks_per_seg; nat_bits_bytes = nat_blocks / BITS_PER_BYTE; nat_bits_blocks = F2FS_BLK_ALIGN((nat_bits_bytes << 1) + 8); if (__is_set_ckpt_flags(ckpt, CP_NAT_BITS_FLAG) && (cp_payload + F2FS_CP_PACKS + NR_CURSEG_PERSIST_TYPE + nat_bits_blocks >= blocks_per_seg)) { f2fs_warn(sbi, "Insane cp_payload: %u, nat_bits_blocks: %u)", cp_payload, nat_bits_blocks); return 1; } if (unlikely(f2fs_cp_error(sbi))) { f2fs_err(sbi, "A bug case: need to run fsck"); return 1; } return 0; } static void init_sb_info(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = sbi->raw_super; int i; sbi->log_sectors_per_block = le32_to_cpu(raw_super->log_sectors_per_block); sbi->log_blocksize = le32_to_cpu(raw_super->log_blocksize); sbi->blocksize = BIT(sbi->log_blocksize); sbi->log_blocks_per_seg = le32_to_cpu(raw_super->log_blocks_per_seg); sbi->blocks_per_seg = BIT(sbi->log_blocks_per_seg); sbi->segs_per_sec = le32_to_cpu(raw_super->segs_per_sec); sbi->secs_per_zone = le32_to_cpu(raw_super->secs_per_zone); sbi->total_sections = le32_to_cpu(raw_super->section_count); sbi->total_node_count = (le32_to_cpu(raw_super->segment_count_nat) / 2) * sbi->blocks_per_seg * NAT_ENTRY_PER_BLOCK; F2FS_ROOT_INO(sbi) = le32_to_cpu(raw_super->root_ino); F2FS_NODE_INO(sbi) = le32_to_cpu(raw_super->node_ino); F2FS_META_INO(sbi) = le32_to_cpu(raw_super->meta_ino); sbi->cur_victim_sec = NULL_SECNO; sbi->gc_mode = GC_NORMAL; sbi->next_victim_seg[BG_GC] = NULL_SEGNO; sbi->next_victim_seg[FG_GC] = NULL_SEGNO; sbi->max_victim_search = DEF_MAX_VICTIM_SEARCH; sbi->migration_granularity = sbi->segs_per_sec; sbi->seq_file_ra_mul = MIN_RA_MUL; sbi->max_fragment_chunk = DEF_FRAGMENT_SIZE; sbi->max_fragment_hole = DEF_FRAGMENT_SIZE; spin_lock_init(&sbi->gc_remaining_trials_lock); atomic64_set(&sbi->current_atomic_write, 0); sbi->dir_level = DEF_DIR_LEVEL; sbi->interval_time[CP_TIME] = DEF_CP_INTERVAL; sbi->interval_time[REQ_TIME] = DEF_IDLE_INTERVAL; sbi->interval_time[DISCARD_TIME] = DEF_IDLE_INTERVAL; sbi->interval_time[GC_TIME] = DEF_IDLE_INTERVAL; sbi->interval_time[DISABLE_TIME] = DEF_DISABLE_INTERVAL; sbi->interval_time[UMOUNT_DISCARD_TIMEOUT] = DEF_UMOUNT_DISCARD_TIMEOUT; clear_sbi_flag(sbi, SBI_NEED_FSCK); for (i = 0; i < NR_COUNT_TYPE; i++) atomic_set(&sbi->nr_pages[i], 0); for (i = 0; i < META; i++) atomic_set(&sbi->wb_sync_req[i], 0); INIT_LIST_HEAD(&sbi->s_list); mutex_init(&sbi->umount_mutex); init_f2fs_rwsem(&sbi->io_order_lock); spin_lock_init(&sbi->cp_lock); sbi->dirty_device = 0; spin_lock_init(&sbi->dev_lock); init_f2fs_rwsem(&sbi->sb_lock); init_f2fs_rwsem(&sbi->pin_sem); } static int init_percpu_info(struct f2fs_sb_info *sbi) { int err; err = percpu_counter_init(&sbi->alloc_valid_block_count, 0, GFP_KERNEL); if (err) return err; err = percpu_counter_init(&sbi->rf_node_block_count, 0, GFP_KERNEL); if (err) goto err_valid_block; err = percpu_counter_init(&sbi->total_valid_inode_count, 0, GFP_KERNEL); if (err) goto err_node_block; return 0; err_node_block: percpu_counter_destroy(&sbi->rf_node_block_count); err_valid_block: percpu_counter_destroy(&sbi->alloc_valid_block_count); return err; } #ifdef CONFIG_BLK_DEV_ZONED struct f2fs_report_zones_args { struct f2fs_sb_info *sbi; struct f2fs_dev_info *dev; }; static int f2fs_report_zone_cb(struct blk_zone *zone, unsigned int idx, void *data) { struct f2fs_report_zones_args *rz_args = data; block_t unusable_blocks = (zone->len - zone->capacity) >> F2FS_LOG_SECTORS_PER_BLOCK; if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) return 0; set_bit(idx, rz_args->dev->blkz_seq); if (!rz_args->sbi->unusable_blocks_per_sec) { rz_args->sbi->unusable_blocks_per_sec = unusable_blocks; return 0; } if (rz_args->sbi->unusable_blocks_per_sec != unusable_blocks) { f2fs_err(rz_args->sbi, "F2FS supports single zone capacity\n"); return -EINVAL; } return 0; } static int init_blkz_info(struct f2fs_sb_info *sbi, int devi) { struct block_device *bdev = FDEV(devi).bdev; sector_t nr_sectors = bdev_nr_sectors(bdev); struct f2fs_report_zones_args rep_zone_arg; u64 zone_sectors; int ret; if (!f2fs_sb_has_blkzoned(sbi)) return 0; zone_sectors = bdev_zone_sectors(bdev); if (!is_power_of_2(zone_sectors)) { f2fs_err(sbi, "F2FS does not support non power of 2 zone sizes\n"); return -EINVAL; } if (sbi->blocks_per_blkz && sbi->blocks_per_blkz != SECTOR_TO_BLOCK(zone_sectors)) return -EINVAL; sbi->blocks_per_blkz = SECTOR_TO_BLOCK(zone_sectors); FDEV(devi).nr_blkz = div_u64(SECTOR_TO_BLOCK(nr_sectors), sbi->blocks_per_blkz); if (nr_sectors & (zone_sectors - 1)) FDEV(devi).nr_blkz++; FDEV(devi).blkz_seq = f2fs_kvzalloc(sbi, BITS_TO_LONGS(FDEV(devi).nr_blkz) * sizeof(unsigned long), GFP_KERNEL); if (!FDEV(devi).blkz_seq) return -ENOMEM; rep_zone_arg.sbi = sbi; rep_zone_arg.dev = &FDEV(devi); ret = blkdev_report_zones(bdev, 0, BLK_ALL_ZONES, f2fs_report_zone_cb, &rep_zone_arg); if (ret < 0) return ret; return 0; } #endif /* * Read f2fs raw super block. * Because we have two copies of super block, so read both of them * to get the first valid one. If any one of them is broken, we pass * them recovery flag back to the caller. */ static int read_raw_super_block(struct f2fs_sb_info *sbi, struct f2fs_super_block **raw_super, int *valid_super_block, int *recovery) { struct super_block *sb = sbi->sb; int block; struct buffer_head *bh; struct f2fs_super_block *super; int err = 0; super = kzalloc(sizeof(struct f2fs_super_block), GFP_KERNEL); if (!super) return -ENOMEM; for (block = 0; block < 2; block++) { bh = sb_bread(sb, block); if (!bh) { f2fs_err(sbi, "Unable to read %dth superblock", block + 1); err = -EIO; *recovery = 1; continue; } /* sanity checking of raw super */ err = sanity_check_raw_super(sbi, bh); if (err) { f2fs_err(sbi, "Can't find valid F2FS filesystem in %dth superblock", block + 1); brelse(bh); *recovery = 1; continue; } if (!*raw_super) { memcpy(super, bh->b_data + F2FS_SUPER_OFFSET, sizeof(*super)); *valid_super_block = block; *raw_super = super; } brelse(bh); } /* No valid superblock */ if (!*raw_super) kfree(super); else err = 0; return err; } int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover) { struct buffer_head *bh; __u32 crc = 0; int err; if ((recover && f2fs_readonly(sbi->sb)) || f2fs_hw_is_readonly(sbi)) { set_sbi_flag(sbi, SBI_NEED_SB_WRITE); return -EROFS; } /* we should update superblock crc here */ if (!recover && f2fs_sb_has_sb_chksum(sbi)) { crc = f2fs_crc32(sbi, F2FS_RAW_SUPER(sbi), offsetof(struct f2fs_super_block, crc)); F2FS_RAW_SUPER(sbi)->crc = cpu_to_le32(crc); } /* write back-up superblock first */ bh = sb_bread(sbi->sb, sbi->valid_super_block ? 0 : 1); if (!bh) return -EIO; err = __f2fs_commit_super(bh, F2FS_RAW_SUPER(sbi)); brelse(bh); /* if we are in recovery path, skip writing valid superblock */ if (recover || err) return err; /* write current valid superblock */ bh = sb_bread(sbi->sb, sbi->valid_super_block); if (!bh) return -EIO; err = __f2fs_commit_super(bh, F2FS_RAW_SUPER(sbi)); brelse(bh); return err; } static void save_stop_reason(struct f2fs_sb_info *sbi, unsigned char reason) { unsigned long flags; spin_lock_irqsave(&sbi->error_lock, flags); if (sbi->stop_reason[reason] < GENMASK(BITS_PER_BYTE - 1, 0)) sbi->stop_reason[reason]++; spin_unlock_irqrestore(&sbi->error_lock, flags); } static void f2fs_record_stop_reason(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); unsigned long flags; int err; f2fs_down_write(&sbi->sb_lock); spin_lock_irqsave(&sbi->error_lock, flags); if (sbi->error_dirty) { memcpy(F2FS_RAW_SUPER(sbi)->s_errors, sbi->errors, MAX_F2FS_ERRORS); sbi->error_dirty = false; } memcpy(raw_super->s_stop_reason, sbi->stop_reason, MAX_STOP_REASON); spin_unlock_irqrestore(&sbi->error_lock, flags); err = f2fs_commit_super(sbi, false); f2fs_up_write(&sbi->sb_lock); if (err) f2fs_err(sbi, "f2fs_commit_super fails to record err:%d", err); } void f2fs_save_errors(struct f2fs_sb_info *sbi, unsigned char flag) { unsigned long flags; spin_lock_irqsave(&sbi->error_lock, flags); if (!test_bit(flag, (unsigned long *)sbi->errors)) { set_bit(flag, (unsigned long *)sbi->errors); sbi->error_dirty = true; } spin_unlock_irqrestore(&sbi->error_lock, flags); } static bool f2fs_update_errors(struct f2fs_sb_info *sbi) { unsigned long flags; bool need_update = false; spin_lock_irqsave(&sbi->error_lock, flags); if (sbi->error_dirty) { memcpy(F2FS_RAW_SUPER(sbi)->s_errors, sbi->errors, MAX_F2FS_ERRORS); sbi->error_dirty = false; need_update = true; } spin_unlock_irqrestore(&sbi->error_lock, flags); return need_update; } static void f2fs_record_errors(struct f2fs_sb_info *sbi, unsigned char error) { int err; f2fs_down_write(&sbi->sb_lock); if (!f2fs_update_errors(sbi)) goto out_unlock; err = f2fs_commit_super(sbi, false); if (err) f2fs_err(sbi, "f2fs_commit_super fails to record errors:%u, err:%d", error, err); out_unlock: f2fs_up_write(&sbi->sb_lock); } void f2fs_handle_error(struct f2fs_sb_info *sbi, unsigned char error) { f2fs_save_errors(sbi, error); f2fs_record_errors(sbi, error); } void f2fs_handle_error_async(struct f2fs_sb_info *sbi, unsigned char error) { f2fs_save_errors(sbi, error); if (!sbi->error_dirty) return; if (!test_bit(error, (unsigned long *)sbi->errors)) return; schedule_work(&sbi->s_error_work); } static bool system_going_down(void) { return system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF || system_state == SYSTEM_RESTART; } void f2fs_handle_critical_error(struct f2fs_sb_info *sbi, unsigned char reason, bool irq_context) { struct super_block *sb = sbi->sb; bool shutdown = reason == STOP_CP_REASON_SHUTDOWN; bool continue_fs = !shutdown && F2FS_OPTION(sbi).errors == MOUNT_ERRORS_CONTINUE; set_ckpt_flags(sbi, CP_ERROR_FLAG); if (!f2fs_hw_is_readonly(sbi)) { save_stop_reason(sbi, reason); if (irq_context && !shutdown) schedule_work(&sbi->s_error_work); else f2fs_record_stop_reason(sbi); } /* * We force ERRORS_RO behavior when system is rebooting. Otherwise we * could panic during 'reboot -f' as the underlying device got already * disabled. */ if (F2FS_OPTION(sbi).errors == MOUNT_ERRORS_PANIC && !shutdown && !system_going_down() && !is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN)) panic("F2FS-fs (device %s): panic forced after error\n", sb->s_id); if (shutdown) set_sbi_flag(sbi, SBI_IS_SHUTDOWN); /* continue filesystem operators if errors=continue */ if (continue_fs || f2fs_readonly(sb)) return; f2fs_warn(sbi, "Remounting filesystem read-only"); /* * Make sure updated value of ->s_mount_flags will be visible before * ->s_flags update */ smp_wmb(); sb->s_flags |= SB_RDONLY; } static void f2fs_record_error_work(struct work_struct *work) { struct f2fs_sb_info *sbi = container_of(work, struct f2fs_sb_info, s_error_work); f2fs_record_stop_reason(sbi); } static int f2fs_scan_devices(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); unsigned int max_devices = MAX_DEVICES; unsigned int logical_blksize; blk_mode_t mode = sb_open_mode(sbi->sb->s_flags); int i; /* Initialize single device information */ if (!RDEV(0).path[0]) { if (!bdev_is_zoned(sbi->sb->s_bdev)) return 0; max_devices = 1; } /* * Initialize multiple devices information, or single * zoned block device information. */ sbi->devs = f2fs_kzalloc(sbi, array_size(max_devices, sizeof(struct f2fs_dev_info)), GFP_KERNEL); if (!sbi->devs) return -ENOMEM; logical_blksize = bdev_logical_block_size(sbi->sb->s_bdev); sbi->aligned_blksize = true; for (i = 0; i < max_devices; i++) { if (i == 0) FDEV(0).bdev_handle = sbi->sb->s_bdev_handle; else if (!RDEV(i).path[0]) break; if (max_devices > 1) { /* Multi-device mount */ memcpy(FDEV(i).path, RDEV(i).path, MAX_PATH_LEN); FDEV(i).total_segments = le32_to_cpu(RDEV(i).total_segments); if (i == 0) { FDEV(i).start_blk = 0; FDEV(i).end_blk = FDEV(i).start_blk + (FDEV(i).total_segments << sbi->log_blocks_per_seg) - 1 + le32_to_cpu(raw_super->segment0_blkaddr); } else { FDEV(i).start_blk = FDEV(i - 1).end_blk + 1; FDEV(i).end_blk = FDEV(i).start_blk + (FDEV(i).total_segments << sbi->log_blocks_per_seg) - 1; FDEV(i).bdev_handle = bdev_open_by_path( FDEV(i).path, mode, sbi->sb, NULL); } } if (IS_ERR(FDEV(i).bdev_handle)) return PTR_ERR(FDEV(i).bdev_handle); FDEV(i).bdev = FDEV(i).bdev_handle->bdev; /* to release errored devices */ sbi->s_ndevs = i + 1; if (logical_blksize != bdev_logical_block_size(FDEV(i).bdev)) sbi->aligned_blksize = false; #ifdef CONFIG_BLK_DEV_ZONED if (bdev_is_zoned(FDEV(i).bdev)) { if (!f2fs_sb_has_blkzoned(sbi)) { f2fs_err(sbi, "Zoned block device feature not enabled"); return -EINVAL; } if (init_blkz_info(sbi, i)) { f2fs_err(sbi, "Failed to initialize F2FS blkzone information"); return -EINVAL; } if (max_devices == 1) break; f2fs_info(sbi, "Mount Device [%2d]: %20s, %8u, %8x - %8x (zone: Host-managed)", i, FDEV(i).path, FDEV(i).total_segments, FDEV(i).start_blk, FDEV(i).end_blk); continue; } #endif f2fs_info(sbi, "Mount Device [%2d]: %20s, %8u, %8x - %8x", i, FDEV(i).path, FDEV(i).total_segments, FDEV(i).start_blk, FDEV(i).end_blk); } f2fs_info(sbi, "IO Block Size: %8ld KB", F2FS_IO_SIZE_KB(sbi)); return 0; } static int f2fs_setup_casefold(struct f2fs_sb_info *sbi) { #if IS_ENABLED(CONFIG_UNICODE) if (f2fs_sb_has_casefold(sbi) && !sbi->sb->s_encoding) { const struct f2fs_sb_encodings *encoding_info; struct unicode_map *encoding; __u16 encoding_flags; encoding_info = f2fs_sb_read_encoding(sbi->raw_super); if (!encoding_info) { f2fs_err(sbi, "Encoding requested by superblock is unknown"); return -EINVAL; } encoding_flags = le16_to_cpu(sbi->raw_super->s_encoding_flags); encoding = utf8_load(encoding_info->version); if (IS_ERR(encoding)) { f2fs_err(sbi, "can't mount with superblock charset: %s-%u.%u.%u " "not supported by the kernel. flags: 0x%x.", encoding_info->name, unicode_major(encoding_info->version), unicode_minor(encoding_info->version), unicode_rev(encoding_info->version), encoding_flags); return PTR_ERR(encoding); } f2fs_info(sbi, "Using encoding defined by superblock: " "%s-%u.%u.%u with flags 0x%hx", encoding_info->name, unicode_major(encoding_info->version), unicode_minor(encoding_info->version), unicode_rev(encoding_info->version), encoding_flags); sbi->sb->s_encoding = encoding; sbi->sb->s_encoding_flags = encoding_flags; } #else if (f2fs_sb_has_casefold(sbi)) { f2fs_err(sbi, "Filesystem with casefold feature cannot be mounted without CONFIG_UNICODE"); return -EINVAL; } #endif return 0; } static void f2fs_tuning_parameters(struct f2fs_sb_info *sbi) { /* adjust parameters according to the volume size */ if (MAIN_SEGS(sbi) <= SMALL_VOLUME_SEGMENTS) { if (f2fs_block_unit_discard(sbi)) SM_I(sbi)->dcc_info->discard_granularity = MIN_DISCARD_GRANULARITY; if (!f2fs_lfs_mode(sbi)) SM_I(sbi)->ipu_policy = BIT(F2FS_IPU_FORCE) | BIT(F2FS_IPU_HONOR_OPU_WRITE); } sbi->readdir_ra = true; } static int f2fs_fill_super(struct super_block *sb, void *data, int silent) { struct f2fs_sb_info *sbi; struct f2fs_super_block *raw_super; struct inode *root; int err; bool skip_recovery = false, need_fsck = false; char *options = NULL; int recovery, i, valid_super_block; struct curseg_info *seg_i; int retry_cnt = 1; #ifdef CONFIG_QUOTA bool quota_enabled = false; #endif try_onemore: err = -EINVAL; raw_super = NULL; valid_super_block = -1; recovery = 0; /* allocate memory for f2fs-specific super block info */ sbi = kzalloc(sizeof(struct f2fs_sb_info), GFP_KERNEL); if (!sbi) return -ENOMEM; sbi->sb = sb; /* initialize locks within allocated memory */ init_f2fs_rwsem(&sbi->gc_lock); mutex_init(&sbi->writepages); init_f2fs_rwsem(&sbi->cp_global_sem); init_f2fs_rwsem(&sbi->node_write); init_f2fs_rwsem(&sbi->node_change); spin_lock_init(&sbi->stat_lock); init_f2fs_rwsem(&sbi->cp_rwsem); init_f2fs_rwsem(&sbi->quota_sem); init_waitqueue_head(&sbi->cp_wait); spin_lock_init(&sbi->error_lock); for (i = 0; i < NR_INODE_TYPE; i++) { INIT_LIST_HEAD(&sbi->inode_list[i]); spin_lock_init(&sbi->inode_lock[i]); } mutex_init(&sbi->flush_lock); /* Load the checksum driver */ sbi->s_chksum_driver = crypto_alloc_shash("crc32", 0, 0); if (IS_ERR(sbi->s_chksum_driver)) { f2fs_err(sbi, "Cannot load crc32 driver."); err = PTR_ERR(sbi->s_chksum_driver); sbi->s_chksum_driver = NULL; goto free_sbi; } /* set a block size */ if (unlikely(!sb_set_blocksize(sb, F2FS_BLKSIZE))) { f2fs_err(sbi, "unable to set blocksize"); goto free_sbi; } err = read_raw_super_block(sbi, &raw_super, &valid_super_block, &recovery); if (err) goto free_sbi; sb->s_fs_info = sbi; sbi->raw_super = raw_super; INIT_WORK(&sbi->s_error_work, f2fs_record_error_work); memcpy(sbi->errors, raw_super->s_errors, MAX_F2FS_ERRORS); memcpy(sbi->stop_reason, raw_super->s_stop_reason, MAX_STOP_REASON); /* precompute checksum seed for metadata */ if (f2fs_sb_has_inode_chksum(sbi)) sbi->s_chksum_seed = f2fs_chksum(sbi, ~0, raw_super->uuid, sizeof(raw_super->uuid)); default_options(sbi, false); /* parse mount options */ options = kstrdup((const char *)data, GFP_KERNEL); if (data && !options) { err = -ENOMEM; goto free_sb_buf; } err = parse_options(sb, options, false); if (err) goto free_options; sb->s_maxbytes = max_file_blocks(NULL) << le32_to_cpu(raw_super->log_blocksize); sb->s_max_links = F2FS_LINK_MAX; err = f2fs_setup_casefold(sbi); if (err) goto free_options; #ifdef CONFIG_QUOTA sb->dq_op = &f2fs_quota_operations; sb->s_qcop = &f2fs_quotactl_ops; sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ; if (f2fs_sb_has_quota_ino(sbi)) { for (i = 0; i < MAXQUOTAS; i++) { if (f2fs_qf_ino(sbi->sb, i)) sbi->nquota_files++; } } #endif sb->s_op = &f2fs_sops; #ifdef CONFIG_FS_ENCRYPTION sb->s_cop = &f2fs_cryptops; #endif #ifdef CONFIG_FS_VERITY sb->s_vop = &f2fs_verityops; #endif sb->s_xattr = f2fs_xattr_handlers; sb->s_export_op = &f2fs_export_ops; sb->s_magic = F2FS_SUPER_MAGIC; sb->s_time_gran = 1; sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | (test_opt(sbi, POSIX_ACL) ? SB_POSIXACL : 0); memcpy(&sb->s_uuid, raw_super->uuid, sizeof(raw_super->uuid)); sb->s_iflags |= SB_I_CGROUPWB; /* init f2fs-specific super block info */ sbi->valid_super_block = valid_super_block; /* disallow all the data/node/meta page writes */ set_sbi_flag(sbi, SBI_POR_DOING); err = f2fs_init_write_merge_io(sbi); if (err) goto free_bio_info; init_sb_info(sbi); err = f2fs_init_iostat(sbi); if (err) goto free_bio_info; err = init_percpu_info(sbi); if (err) goto free_iostat; if (F2FS_IO_ALIGNED(sbi)) { sbi->write_io_dummy = mempool_create_page_pool(2 * (F2FS_IO_SIZE(sbi) - 1), 0); if (!sbi->write_io_dummy) { err = -ENOMEM; goto free_percpu; } } /* init per sbi slab cache */ err = f2fs_init_xattr_caches(sbi); if (err) goto free_io_dummy; err = f2fs_init_page_array_cache(sbi); if (err) goto free_xattr_cache; /* get an inode for meta space */ sbi->meta_inode = f2fs_iget(sb, F2FS_META_INO(sbi)); if (IS_ERR(sbi->meta_inode)) { f2fs_err(sbi, "Failed to read F2FS meta data inode"); err = PTR_ERR(sbi->meta_inode); goto free_page_array_cache; } err = f2fs_get_valid_checkpoint(sbi); if (err) { f2fs_err(sbi, "Failed to get valid F2FS checkpoint"); goto free_meta_inode; } if (__is_set_ckpt_flags(F2FS_CKPT(sbi), CP_QUOTA_NEED_FSCK_FLAG)) set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); if (__is_set_ckpt_flags(F2FS_CKPT(sbi), CP_DISABLED_QUICK_FLAG)) { set_sbi_flag(sbi, SBI_CP_DISABLED_QUICK); sbi->interval_time[DISABLE_TIME] = DEF_DISABLE_QUICK_INTERVAL; } if (__is_set_ckpt_flags(F2FS_CKPT(sbi), CP_FSCK_FLAG)) set_sbi_flag(sbi, SBI_NEED_FSCK); /* Initialize device list */ err = f2fs_scan_devices(sbi); if (err) { f2fs_err(sbi, "Failed to find devices"); goto free_devices; } err = f2fs_init_post_read_wq(sbi); if (err) { f2fs_err(sbi, "Failed to initialize post read workqueue"); goto free_devices; } sbi->total_valid_node_count = le32_to_cpu(sbi->ckpt->valid_node_count); percpu_counter_set(&sbi->total_valid_inode_count, le32_to_cpu(sbi->ckpt->valid_inode_count)); sbi->user_block_count = le64_to_cpu(sbi->ckpt->user_block_count); sbi->total_valid_block_count = le64_to_cpu(sbi->ckpt->valid_block_count); sbi->last_valid_block_count = sbi->total_valid_block_count; sbi->reserved_blocks = 0; sbi->current_reserved_blocks = 0; limit_reserve_root(sbi); adjust_unusable_cap_perc(sbi); f2fs_init_extent_cache_info(sbi); f2fs_init_ino_entry_info(sbi); f2fs_init_fsync_node_info(sbi); /* setup checkpoint request control and start checkpoint issue thread */ f2fs_init_ckpt_req_control(sbi); if (!f2fs_readonly(sb) && !test_opt(sbi, DISABLE_CHECKPOINT) && test_opt(sbi, MERGE_CHECKPOINT)) { err = f2fs_start_ckpt_thread(sbi); if (err) { f2fs_err(sbi, "Failed to start F2FS issue_checkpoint_thread (%d)", err); goto stop_ckpt_thread; } } /* setup f2fs internal modules */ err = f2fs_build_segment_manager(sbi); if (err) { f2fs_err(sbi, "Failed to initialize F2FS segment manager (%d)", err); goto free_sm; } err = f2fs_build_node_manager(sbi); if (err) { f2fs_err(sbi, "Failed to initialize F2FS node manager (%d)", err); goto free_nm; } err = adjust_reserved_segment(sbi); if (err) goto free_nm; /* For write statistics */ sbi->sectors_written_start = f2fs_get_sectors_written(sbi); /* Read accumulated write IO statistics if exists */ seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE); if (__exist_node_summaries(sbi)) sbi->kbytes_written = le64_to_cpu(seg_i->journal->info.kbytes_written); f2fs_build_gc_manager(sbi); err = f2fs_build_stats(sbi); if (err) goto free_nm; /* get an inode for node space */ sbi->node_inode = f2fs_iget(sb, F2FS_NODE_INO(sbi)); if (IS_ERR(sbi->node_inode)) { f2fs_err(sbi, "Failed to read node inode"); err = PTR_ERR(sbi->node_inode); goto free_stats; } /* read root inode and dentry */ root = f2fs_iget(sb, F2FS_ROOT_INO(sbi)); if (IS_ERR(root)) { f2fs_err(sbi, "Failed to read root inode"); err = PTR_ERR(root); goto free_node_inode; } if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size || !root->i_nlink) { iput(root); err = -EINVAL; goto free_node_inode; } sb->s_root = d_make_root(root); /* allocate root dentry */ if (!sb->s_root) { err = -ENOMEM; goto free_node_inode; } err = f2fs_init_compress_inode(sbi); if (err) goto free_root_inode; err = f2fs_register_sysfs(sbi); if (err) goto free_compress_inode; #ifdef CONFIG_QUOTA /* Enable quota usage during mount */ if (f2fs_sb_has_quota_ino(sbi) && !f2fs_readonly(sb)) { err = f2fs_enable_quotas(sb); if (err) f2fs_err(sbi, "Cannot turn on quotas: error %d", err); } quota_enabled = f2fs_recover_quota_begin(sbi); #endif /* if there are any orphan inodes, free them */ err = f2fs_recover_orphan_inodes(sbi); if (err) goto free_meta; if (unlikely(is_set_ckpt_flags(sbi, CP_DISABLED_FLAG))) goto reset_checkpoint; /* recover fsynced data */ if (!test_opt(sbi, DISABLE_ROLL_FORWARD) && !test_opt(sbi, NORECOVERY)) { /* * mount should be failed, when device has readonly mode, and * previous checkpoint was not done by clean system shutdown. */ if (f2fs_hw_is_readonly(sbi)) { if (!is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { err = f2fs_recover_fsync_data(sbi, true); if (err > 0) { err = -EROFS; f2fs_err(sbi, "Need to recover fsync data, but " "write access unavailable, please try " "mount w/ disable_roll_forward or norecovery"); } if (err < 0) goto free_meta; } f2fs_info(sbi, "write access unavailable, skipping recovery"); goto reset_checkpoint; } if (need_fsck) set_sbi_flag(sbi, SBI_NEED_FSCK); if (skip_recovery) goto reset_checkpoint; err = f2fs_recover_fsync_data(sbi, false); if (err < 0) { if (err != -ENOMEM) skip_recovery = true; need_fsck = true; f2fs_err(sbi, "Cannot recover all fsync data errno=%d", err); goto free_meta; } } else { err = f2fs_recover_fsync_data(sbi, true); if (!f2fs_readonly(sb) && err > 0) { err = -EINVAL; f2fs_err(sbi, "Need to recover fsync data"); goto free_meta; } } #ifdef CONFIG_QUOTA f2fs_recover_quota_end(sbi, quota_enabled); #endif reset_checkpoint: /* * If the f2fs is not readonly and fsync data recovery succeeds, * check zoned block devices' write pointer consistency. */ if (!err && !f2fs_readonly(sb) && f2fs_sb_has_blkzoned(sbi)) { err = f2fs_check_write_pointer(sbi); if (err) goto free_meta; } f2fs_init_inmem_curseg(sbi); /* f2fs_recover_fsync_data() cleared this already */ clear_sbi_flag(sbi, SBI_POR_DOING); if (test_opt(sbi, DISABLE_CHECKPOINT)) { err = f2fs_disable_checkpoint(sbi); if (err) goto sync_free_meta; } else if (is_set_ckpt_flags(sbi, CP_DISABLED_FLAG)) { f2fs_enable_checkpoint(sbi); } /* * If filesystem is not mounted as read-only then * do start the gc_thread. */ if ((F2FS_OPTION(sbi).bggc_mode != BGGC_MODE_OFF || test_opt(sbi, GC_MERGE)) && !f2fs_readonly(sb)) { /* After POR, we can run background GC thread.*/ err = f2fs_start_gc_thread(sbi); if (err) goto sync_free_meta; } kvfree(options); /* recover broken superblock */ if (recovery) { err = f2fs_commit_super(sbi, true); f2fs_info(sbi, "Try to recover %dth superblock, ret: %d", sbi->valid_super_block ? 1 : 2, err); } f2fs_join_shrinker(sbi); f2fs_tuning_parameters(sbi); f2fs_notice(sbi, "Mounted with checkpoint version = %llx", cur_cp_version(F2FS_CKPT(sbi))); f2fs_update_time(sbi, CP_TIME); f2fs_update_time(sbi, REQ_TIME); clear_sbi_flag(sbi, SBI_CP_DISABLED_QUICK); return 0; sync_free_meta: /* safe to flush all the data */ sync_filesystem(sbi->sb); retry_cnt = 0; free_meta: #ifdef CONFIG_QUOTA f2fs_truncate_quota_inode_pages(sb); if (f2fs_sb_has_quota_ino(sbi) && !f2fs_readonly(sb)) f2fs_quota_off_umount(sbi->sb); #endif /* * Some dirty meta pages can be produced by f2fs_recover_orphan_inodes() * failed by EIO. Then, iput(node_inode) can trigger balance_fs_bg() * followed by f2fs_write_checkpoint() through f2fs_write_node_pages(), which * falls into an infinite loop in f2fs_sync_meta_pages(). */ truncate_inode_pages_final(META_MAPPING(sbi)); /* evict some inodes being cached by GC */ evict_inodes(sb); f2fs_unregister_sysfs(sbi); free_compress_inode: f2fs_destroy_compress_inode(sbi); free_root_inode: dput(sb->s_root); sb->s_root = NULL; free_node_inode: f2fs_release_ino_entry(sbi, true); truncate_inode_pages_final(NODE_MAPPING(sbi)); iput(sbi->node_inode); sbi->node_inode = NULL; free_stats: f2fs_destroy_stats(sbi); free_nm: /* stop discard thread before destroying node manager */ f2fs_stop_discard_thread(sbi); f2fs_destroy_node_manager(sbi); free_sm: f2fs_destroy_segment_manager(sbi); stop_ckpt_thread: f2fs_stop_ckpt_thread(sbi); /* flush s_error_work before sbi destroy */ flush_work(&sbi->s_error_work); f2fs_destroy_post_read_wq(sbi); free_devices: destroy_device_list(sbi); kvfree(sbi->ckpt); free_meta_inode: make_bad_inode(sbi->meta_inode); iput(sbi->meta_inode); sbi->meta_inode = NULL; free_page_array_cache: f2fs_destroy_page_array_cache(sbi); free_xattr_cache: f2fs_destroy_xattr_caches(sbi); free_io_dummy: mempool_destroy(sbi->write_io_dummy); free_percpu: destroy_percpu_info(sbi); free_iostat: f2fs_destroy_iostat(sbi); free_bio_info: for (i = 0; i < NR_PAGE_TYPE; i++) kvfree(sbi->write_io[i]); #if IS_ENABLED(CONFIG_UNICODE) utf8_unload(sb->s_encoding); sb->s_encoding = NULL; #endif free_options: #ifdef CONFIG_QUOTA for (i = 0; i < MAXQUOTAS; i++) kfree(F2FS_OPTION(sbi).s_qf_names[i]); #endif fscrypt_free_dummy_policy(&F2FS_OPTION(sbi).dummy_enc_policy); kvfree(options); free_sb_buf: kfree(raw_super); free_sbi: if (sbi->s_chksum_driver) crypto_free_shash(sbi->s_chksum_driver); kfree(sbi); /* give only one another chance */ if (retry_cnt > 0 && skip_recovery) { retry_cnt--; shrink_dcache_sb(sb); goto try_onemore; } return err; } static struct dentry *f2fs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { return mount_bdev(fs_type, flags, dev_name, data, f2fs_fill_super); } static void kill_f2fs_super(struct super_block *sb) { struct f2fs_sb_info *sbi = F2FS_SB(sb); if (sb->s_root) { set_sbi_flag(sbi, SBI_IS_CLOSE); f2fs_stop_gc_thread(sbi); f2fs_stop_discard_thread(sbi); #ifdef CONFIG_F2FS_FS_COMPRESSION /* * latter evict_inode() can bypass checking and invalidating * compress inode cache. */ if (test_opt(sbi, COMPRESS_CACHE)) truncate_inode_pages_final(COMPRESS_MAPPING(sbi)); #endif if (is_sbi_flag_set(sbi, SBI_IS_DIRTY) || !is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { struct cp_control cpc = { .reason = CP_UMOUNT, }; stat_inc_cp_call_count(sbi, TOTAL_CALL); f2fs_write_checkpoint(sbi, &cpc); } if (is_sbi_flag_set(sbi, SBI_IS_RECOVERED) && f2fs_readonly(sb)) sb->s_flags &= ~SB_RDONLY; } kill_block_super(sb); /* Release block devices last, after fscrypt_destroy_keyring(). */ if (sbi) { destroy_device_list(sbi); kfree(sbi); sb->s_fs_info = NULL; } } static struct file_system_type f2fs_fs_type = { .owner = THIS_MODULE, .name = "f2fs", .mount = f2fs_mount, .kill_sb = kill_f2fs_super, .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, }; MODULE_ALIAS_FS("f2fs"); static int __init init_inodecache(void) { f2fs_inode_cachep = kmem_cache_create("f2fs_inode_cache", sizeof(struct f2fs_inode_info), 0, SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, NULL); return f2fs_inode_cachep ? 0 : -ENOMEM; } static void destroy_inodecache(void) { /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. */ rcu_barrier(); kmem_cache_destroy(f2fs_inode_cachep); } static int __init init_f2fs_fs(void) { int err; if (PAGE_SIZE != F2FS_BLKSIZE) { printk("F2FS not supported on PAGE_SIZE(%lu) != BLOCK_SIZE(%lu)\n", PAGE_SIZE, F2FS_BLKSIZE); return -EINVAL; } err = init_inodecache(); if (err) goto fail; err = f2fs_create_node_manager_caches(); if (err) goto free_inodecache; err = f2fs_create_segment_manager_caches(); if (err) goto free_node_manager_caches; err = f2fs_create_checkpoint_caches(); if (err) goto free_segment_manager_caches; err = f2fs_create_recovery_cache(); if (err) goto free_checkpoint_caches; err = f2fs_create_extent_cache(); if (err) goto free_recovery_cache; err = f2fs_create_garbage_collection_cache(); if (err) goto free_extent_cache; err = f2fs_init_sysfs(); if (err) goto free_garbage_collection_cache; err = f2fs_init_shrinker(); if (err) goto free_sysfs; err = register_filesystem(&f2fs_fs_type); if (err) goto free_shrinker; f2fs_create_root_stats(); err = f2fs_init_post_read_processing(); if (err) goto free_root_stats; err = f2fs_init_iostat_processing(); if (err) goto free_post_read; err = f2fs_init_bio_entry_cache(); if (err) goto free_iostat; err = f2fs_init_bioset(); if (err) goto free_bio_entry_cache; err = f2fs_init_compress_mempool(); if (err) goto free_bioset; err = f2fs_init_compress_cache(); if (err) goto free_compress_mempool; err = f2fs_create_casefold_cache(); if (err) goto free_compress_cache; return 0; free_compress_cache: f2fs_destroy_compress_cache(); free_compress_mempool: f2fs_destroy_compress_mempool(); free_bioset: f2fs_destroy_bioset(); free_bio_entry_cache: f2fs_destroy_bio_entry_cache(); free_iostat: f2fs_destroy_iostat_processing(); free_post_read: f2fs_destroy_post_read_processing(); free_root_stats: f2fs_destroy_root_stats(); unregister_filesystem(&f2fs_fs_type); free_shrinker: f2fs_exit_shrinker(); free_sysfs: f2fs_exit_sysfs(); free_garbage_collection_cache: f2fs_destroy_garbage_collection_cache(); free_extent_cache: f2fs_destroy_extent_cache(); free_recovery_cache: f2fs_destroy_recovery_cache(); free_checkpoint_caches: f2fs_destroy_checkpoint_caches(); free_segment_manager_caches: f2fs_destroy_segment_manager_caches(); free_node_manager_caches: f2fs_destroy_node_manager_caches(); free_inodecache: destroy_inodecache(); fail: return err; } static void __exit exit_f2fs_fs(void) { f2fs_destroy_casefold_cache(); f2fs_destroy_compress_cache(); f2fs_destroy_compress_mempool(); f2fs_destroy_bioset(); f2fs_destroy_bio_entry_cache(); f2fs_destroy_iostat_processing(); f2fs_destroy_post_read_processing(); f2fs_destroy_root_stats(); unregister_filesystem(&f2fs_fs_type); f2fs_exit_shrinker(); f2fs_exit_sysfs(); f2fs_destroy_garbage_collection_cache(); f2fs_destroy_extent_cache(); f2fs_destroy_recovery_cache(); f2fs_destroy_checkpoint_caches(); f2fs_destroy_segment_manager_caches(); f2fs_destroy_node_manager_caches(); destroy_inodecache(); } module_init(init_f2fs_fs) module_exit(exit_f2fs_fs) MODULE_AUTHOR("Samsung Electronics's Praesto Team"); MODULE_DESCRIPTION("Flash Friendly File System"); MODULE_LICENSE("GPL"); MODULE_SOFTDEP("pre: crc32");
5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 /* * llc_pdu.c - access to PDU internals * * Copyright (c) 1997 by Procom Technology, Inc. * 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br> * * This program can be redistributed or modified under the terms of the * GNU General Public License as published by the Free Software Foundation. * This program is distributed without any warranty or implied warranty * of merchantability or fitness for a particular purpose. * * See the GNU General Public License for more details. */ #include <linux/netdevice.h> #include <net/llc_pdu.h> static void llc_pdu_decode_pdu_type(struct sk_buff *skb, u8 *type); static u8 llc_pdu_get_pf_bit(struct llc_pdu_sn *pdu); void llc_pdu_set_cmd_rsp(struct sk_buff *skb, u8 pdu_type) { llc_pdu_un_hdr(skb)->ssap |= pdu_type; } /** * llc_pdu_set_pf_bit - sets poll/final bit in LLC header * @skb: Frame to set bit in * @bit_value: poll/final bit (0 or 1). * * This function sets poll/final bit in LLC header (based on type of PDU). * in I or S pdus, p/f bit is right bit of fourth byte in header. in U * pdus p/f bit is fifth bit of third byte. */ void llc_pdu_set_pf_bit(struct sk_buff *skb, u8 bit_value) { u8 pdu_type; struct llc_pdu_sn *pdu; llc_pdu_decode_pdu_type(skb, &pdu_type); pdu = llc_pdu_sn_hdr(skb); switch (pdu_type) { case LLC_PDU_TYPE_I: case LLC_PDU_TYPE_S: pdu->ctrl_2 = (pdu->ctrl_2 & 0xFE) | bit_value; break; case LLC_PDU_TYPE_U: pdu->ctrl_1 |= (pdu->ctrl_1 & 0xEF) | (bit_value << 4); break; } } /** * llc_pdu_decode_pf_bit - extracs poll/final bit from LLC header * @skb: input skb that p/f bit must be extracted from it * @pf_bit: poll/final bit (0 or 1) * * This function extracts poll/final bit from LLC header (based on type of * PDU). In I or S pdus, p/f bit is right bit of fourth byte in header. In * U pdus p/f bit is fifth bit of third byte. */ void llc_pdu_decode_pf_bit(struct sk_buff *skb, u8 *pf_bit) { u8 pdu_type; struct llc_pdu_sn *pdu; llc_pdu_decode_pdu_type(skb, &pdu_type); pdu = llc_pdu_sn_hdr(skb); switch (pdu_type) { case LLC_PDU_TYPE_I: case LLC_PDU_TYPE_S: *pf_bit = pdu->ctrl_2 & LLC_S_PF_BIT_MASK; break; case LLC_PDU_TYPE_U: *pf_bit = (pdu->ctrl_1 & LLC_U_PF_BIT_MASK) >> 4; break; } } /** * llc_pdu_init_as_disc_cmd - Builds DISC PDU * @skb: Address of the skb to build * @p_bit: The P bit to set in the PDU * * Builds a pdu frame as a DISC command. */ void llc_pdu_init_as_disc_cmd(struct sk_buff *skb, u8 p_bit) { struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); pdu->ctrl_1 = LLC_PDU_TYPE_U; pdu->ctrl_1 |= LLC_2_PDU_CMD_DISC; pdu->ctrl_1 |= ((p_bit & 1) << 4) & LLC_U_PF_BIT_MASK; } /** * llc_pdu_init_as_i_cmd - builds I pdu * @skb: Address of the skb to build * @p_bit: The P bit to set in the PDU * @ns: The sequence number of the data PDU * @nr: The seq. number of the expected I PDU from the remote * * Builds a pdu frame as an I command. */ void llc_pdu_init_as_i_cmd(struct sk_buff *skb, u8 p_bit, u8 ns, u8 nr) { struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); pdu->ctrl_1 = LLC_PDU_TYPE_I; pdu->ctrl_2 = 0; pdu->ctrl_2 |= (p_bit & LLC_I_PF_BIT_MASK); /* p/f bit */ pdu->ctrl_1 |= (ns << 1) & 0xFE; /* set N(S) in bits 2..8 */ pdu->ctrl_2 |= (nr << 1) & 0xFE; /* set N(R) in bits 10..16 */ } /** * llc_pdu_init_as_rej_cmd - builds REJ PDU * @skb: Address of the skb to build * @p_bit: The P bit to set in the PDU * @nr: The seq. number of the expected I PDU from the remote * * Builds a pdu frame as a REJ command. */ void llc_pdu_init_as_rej_cmd(struct sk_buff *skb, u8 p_bit, u8 nr) { struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); pdu->ctrl_1 = LLC_PDU_TYPE_S; pdu->ctrl_1 |= LLC_2_PDU_CMD_REJ; pdu->ctrl_2 = 0; pdu->ctrl_2 |= p_bit & LLC_S_PF_BIT_MASK; pdu->ctrl_1 &= 0x0F; /* setting bits 5..8 to zero(reserved) */ pdu->ctrl_2 |= (nr << 1) & 0xFE; /* set N(R) in bits 10..16 */ } /** * llc_pdu_init_as_rnr_cmd - builds RNR pdu * @skb: Address of the skb to build * @p_bit: The P bit to set in the PDU * @nr: The seq. number of the expected I PDU from the remote * * Builds a pdu frame as an RNR command. */ void llc_pdu_init_as_rnr_cmd(struct sk_buff *skb, u8 p_bit, u8 nr) { struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); pdu->ctrl_1 = LLC_PDU_TYPE_S; pdu->ctrl_1 |= LLC_2_PDU_CMD_RNR; pdu->ctrl_2 = 0; pdu->ctrl_2 |= p_bit & LLC_S_PF_BIT_MASK; pdu->ctrl_1 &= 0x0F; /* setting bits 5..8 to zero(reserved) */ pdu->ctrl_2 |= (nr << 1) & 0xFE; /* set N(R) in bits 10..16 */ } /** * llc_pdu_init_as_rr_cmd - Builds RR pdu * @skb: Address of the skb to build * @p_bit: The P bit to set in the PDU * @nr: The seq. number of the expected I PDU from the remote * * Builds a pdu frame as an RR command. */ void llc_pdu_init_as_rr_cmd(struct sk_buff *skb, u8 p_bit, u8 nr) { struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); pdu->ctrl_1 = LLC_PDU_TYPE_S; pdu->ctrl_1 |= LLC_2_PDU_CMD_RR; pdu->ctrl_2 = p_bit & LLC_S_PF_BIT_MASK; pdu->ctrl_1 &= 0x0F; /* setting bits 5..8 to zero(reserved) */ pdu->ctrl_2 |= (nr << 1) & 0xFE; /* set N(R) in bits 10..16 */ } /** * llc_pdu_init_as_sabme_cmd - builds SABME pdu * @skb: Address of the skb to build * @p_bit: The P bit to set in the PDU * * Builds a pdu frame as an SABME command. */ void llc_pdu_init_as_sabme_cmd(struct sk_buff *skb, u8 p_bit) { struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); pdu->ctrl_1 = LLC_PDU_TYPE_U; pdu->ctrl_1 |= LLC_2_PDU_CMD_SABME; pdu->ctrl_1 |= ((p_bit & 1) << 4) & LLC_U_PF_BIT_MASK; } /** * llc_pdu_init_as_dm_rsp - builds DM response pdu * @skb: Address of the skb to build * @f_bit: The F bit to set in the PDU * * Builds a pdu frame as a DM response. */ void llc_pdu_init_as_dm_rsp(struct sk_buff *skb, u8 f_bit) { struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); pdu->ctrl_1 = LLC_PDU_TYPE_U; pdu->ctrl_1 |= LLC_2_PDU_RSP_DM; pdu->ctrl_1 |= ((f_bit & 1) << 4) & LLC_U_PF_BIT_MASK; } /** * llc_pdu_init_as_frmr_rsp - builds FRMR response PDU * @skb: Address of the frame to build * @prev_pdu: The rejected PDU frame * @f_bit: The F bit to set in the PDU * @vs: tx state vari value for the data link conn at the rejecting LLC * @vr: rx state var value for the data link conn at the rejecting LLC * @vzyxw: completely described in the IEEE Std 802.2 document (Pg 55) * * Builds a pdu frame as a FRMR response. */ void llc_pdu_init_as_frmr_rsp(struct sk_buff *skb, struct llc_pdu_sn *prev_pdu, u8 f_bit, u8 vs, u8 vr, u8 vzyxw) { struct llc_frmr_info *frmr_info; u8 prev_pf = 0; u8 *ctrl; struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); pdu->ctrl_1 = LLC_PDU_TYPE_U; pdu->ctrl_1 |= LLC_2_PDU_RSP_FRMR; pdu->ctrl_1 |= ((f_bit & 1) << 4) & LLC_U_PF_BIT_MASK; frmr_info = (struct llc_frmr_info *)&pdu->ctrl_2; ctrl = (u8 *)&prev_pdu->ctrl_1; FRMR_INFO_SET_REJ_CNTRL(frmr_info,ctrl); FRMR_INFO_SET_Vs(frmr_info, vs); FRMR_INFO_SET_Vr(frmr_info, vr); prev_pf = llc_pdu_get_pf_bit(prev_pdu); FRMR_INFO_SET_C_R_BIT(frmr_info, prev_pf); FRMR_INFO_SET_INVALID_PDU_CTRL_IND(frmr_info, vzyxw); FRMR_INFO_SET_INVALID_PDU_INFO_IND(frmr_info, vzyxw); FRMR_INFO_SET_PDU_INFO_2LONG_IND(frmr_info, vzyxw); FRMR_INFO_SET_PDU_INVALID_Nr_IND(frmr_info, vzyxw); FRMR_INFO_SET_PDU_INVALID_Ns_IND(frmr_info, vzyxw); skb_put(skb, sizeof(struct llc_frmr_info)); } /** * llc_pdu_init_as_rr_rsp - builds RR response pdu * @skb: Address of the skb to build * @f_bit: The F bit to set in the PDU * @nr: The seq. number of the expected data PDU from the remote * * Builds a pdu frame as an RR response. */ void llc_pdu_init_as_rr_rsp(struct sk_buff *skb, u8 f_bit, u8 nr) { struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); pdu->ctrl_1 = LLC_PDU_TYPE_S; pdu->ctrl_1 |= LLC_2_PDU_RSP_RR; pdu->ctrl_2 = 0; pdu->ctrl_2 |= f_bit & LLC_S_PF_BIT_MASK; pdu->ctrl_1 &= 0x0F; /* setting bits 5..8 to zero(reserved) */ pdu->ctrl_2 |= (nr << 1) & 0xFE; /* set N(R) in bits 10..16 */ } /** * llc_pdu_init_as_rej_rsp - builds REJ response pdu * @skb: Address of the skb to build * @f_bit: The F bit to set in the PDU * @nr: The seq. number of the expected data PDU from the remote * * Builds a pdu frame as a REJ response. */ void llc_pdu_init_as_rej_rsp(struct sk_buff *skb, u8 f_bit, u8 nr) { struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); pdu->ctrl_1 = LLC_PDU_TYPE_S; pdu->ctrl_1 |= LLC_2_PDU_RSP_REJ; pdu->ctrl_2 = 0; pdu->ctrl_2 |= f_bit & LLC_S_PF_BIT_MASK; pdu->ctrl_1 &= 0x0F; /* setting bits 5..8 to zero(reserved) */ pdu->ctrl_2 |= (nr << 1) & 0xFE; /* set N(R) in bits 10..16 */ } /** * llc_pdu_init_as_rnr_rsp - builds RNR response pdu * @skb: Address of the frame to build * @f_bit: The F bit to set in the PDU * @nr: The seq. number of the expected data PDU from the remote * * Builds a pdu frame as an RNR response. */ void llc_pdu_init_as_rnr_rsp(struct sk_buff *skb, u8 f_bit, u8 nr) { struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); pdu->ctrl_1 = LLC_PDU_TYPE_S; pdu->ctrl_1 |= LLC_2_PDU_RSP_RNR; pdu->ctrl_2 = 0; pdu->ctrl_2 |= f_bit & LLC_S_PF_BIT_MASK; pdu->ctrl_1 &= 0x0F; /* setting bits 5..8 to zero(reserved) */ pdu->ctrl_2 |= (nr << 1) & 0xFE; /* set N(R) in bits 10..16 */ } /** * llc_pdu_init_as_ua_rsp - builds UA response pdu * @skb: Address of the frame to build * @f_bit: The F bit to set in the PDU * * Builds a pdu frame as a UA response. */ void llc_pdu_init_as_ua_rsp(struct sk_buff *skb, u8 f_bit) { struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); pdu->ctrl_1 = LLC_PDU_TYPE_U; pdu->ctrl_1 |= LLC_2_PDU_RSP_UA; pdu->ctrl_1 |= ((f_bit & 1) << 4) & LLC_U_PF_BIT_MASK; } /** * llc_pdu_decode_pdu_type - designates PDU type * @skb: input skb that type of it must be designated. * @type: type of PDU (output argument). * * This function designates type of PDU (I, S or U). */ static void llc_pdu_decode_pdu_type(struct sk_buff *skb, u8 *type) { struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); if (pdu->ctrl_1 & 1) { if ((pdu->ctrl_1 & LLC_PDU_TYPE_U) == LLC_PDU_TYPE_U) *type = LLC_PDU_TYPE_U; else *type = LLC_PDU_TYPE_S; } else *type = LLC_PDU_TYPE_I; } /** * llc_pdu_get_pf_bit - extracts p/f bit of input PDU * @pdu: pointer to LLC header. * * This function extracts p/f bit of input PDU. at first examines type of * PDU and then extracts p/f bit. Returns the p/f bit. */ static u8 llc_pdu_get_pf_bit(struct llc_pdu_sn *pdu) { u8 pdu_type; u8 pf_bit = 0; if (pdu->ctrl_1 & 1) { if ((pdu->ctrl_1 & LLC_PDU_TYPE_U) == LLC_PDU_TYPE_U) pdu_type = LLC_PDU_TYPE_U; else pdu_type = LLC_PDU_TYPE_S; } else pdu_type = LLC_PDU_TYPE_I; switch (pdu_type) { case LLC_PDU_TYPE_I: case LLC_PDU_TYPE_S: pf_bit = pdu->ctrl_2 & LLC_S_PF_BIT_MASK; break; case LLC_PDU_TYPE_U: pf_bit = (pdu->ctrl_1 & LLC_U_PF_BIT_MASK) >> 4; break; } return pf_bit; }
6 48 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Global definitions for the ARP (RFC 826) protocol. * * Version: @(#)if_arp.h 1.0.1 04/16/93 * * Authors: Original taken from Berkeley UNIX 4.3, (c) UCB 1986-1988 * Portions taken from the KA9Q/NOS (v2.00m PA0GRI) source. * Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Florian La Roche, * Jonathan Layes <layes@loran.com> * Arnaldo Carvalho de Melo <acme@conectiva.com.br> ARPHRD_HWX25 */ #ifndef _LINUX_IF_ARP_H #define _LINUX_IF_ARP_H #include <linux/skbuff.h> #include <uapi/linux/if_arp.h> static inline struct arphdr *arp_hdr(const struct sk_buff *skb) { return (struct arphdr *)skb_network_header(skb); } static inline unsigned int arp_hdr_len(const struct net_device *dev) { switch (dev->type) { #if IS_ENABLED(CONFIG_FIREWIRE_NET) case ARPHRD_IEEE1394: /* ARP header, device address and 2 IP addresses */ return sizeof(struct arphdr) + dev->addr_len + sizeof(u32) * 2; #endif default: /* ARP header, plus 2 device addresses, plus 2 IP addresses. */ return sizeof(struct arphdr) + (dev->addr_len + sizeof(u32)) * 2; } } static inline bool dev_is_mac_header_xmit(const struct net_device *dev) { switch (dev->type) { case ARPHRD_TUNNEL: case ARPHRD_TUNNEL6: case ARPHRD_SIT: case ARPHRD_IPGRE: case ARPHRD_IP6GRE: case ARPHRD_VOID: case ARPHRD_NONE: case ARPHRD_RAWIP: case ARPHRD_PIMREG: /* PPP adds its l2 header automatically in ppp_start_xmit(). * This makes it look like an l3 device to __bpf_redirect() and tcf_mirred_init(). */ case ARPHRD_PPP: return false; default: return true; } } #endif /* _LINUX_IF_ARP_H */
1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 // SPDX-License-Identifier: GPL-2.0-or-later /* * Jeilinj subdriver * * Supports some Jeilin dual-mode cameras which use bulk transport and * download raw JPEG data. * * Copyright (C) 2009 Theodore Kilgore * * Sportscam DV15 support and control settings are * Copyright (C) 2011 Patrice Chotard */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #define MODULE_NAME "jeilinj" #include <linux/slab.h> #include "gspca.h" #include "jpeg.h" MODULE_AUTHOR("Theodore Kilgore <kilgota@auburn.edu>"); MODULE_DESCRIPTION("GSPCA/JEILINJ USB Camera Driver"); MODULE_LICENSE("GPL"); /* Default timeouts, in ms */ #define JEILINJ_CMD_TIMEOUT 500 #define JEILINJ_CMD_DELAY 160 #define JEILINJ_DATA_TIMEOUT 1000 /* Maximum transfer size to use. */ #define JEILINJ_MAX_TRANSFER 0x200 #define FRAME_HEADER_LEN 0x10 #define FRAME_START 0xFFFFFFFF enum { SAKAR_57379, SPORTSCAM_DV15, }; #define CAMQUALITY_MIN 0 /* highest cam quality */ #define CAMQUALITY_MAX 97 /* lowest cam quality */ /* Structure to hold all of our device specific stuff */ struct sd { struct gspca_dev gspca_dev; /* !! must be the first item */ int blocks_left; const struct v4l2_pix_format *cap_mode; struct v4l2_ctrl *freq; struct v4l2_ctrl *jpegqual; /* Driver stuff */ u8 type; u8 quality; /* image quality */ #define QUALITY_MIN 35 #define QUALITY_MAX 85 #define QUALITY_DEF 85 u8 jpeg_hdr[JPEG_HDR_SZ]; }; struct jlj_command { unsigned char instruction[2]; unsigned char ack_wanted; unsigned char delay; }; /* AFAICT these cameras will only do 320x240. */ static struct v4l2_pix_format jlj_mode[] = { { 320, 240, V4L2_PIX_FMT_JPEG, V4L2_FIELD_NONE, .bytesperline = 320, .sizeimage = 320 * 240, .colorspace = V4L2_COLORSPACE_JPEG, .priv = 0}, { 640, 480, V4L2_PIX_FMT_JPEG, V4L2_FIELD_NONE, .bytesperline = 640, .sizeimage = 640 * 480, .colorspace = V4L2_COLORSPACE_JPEG, .priv = 0} }; /* * cam uses endpoint 0x03 to send commands, 0x84 for read commands, * and 0x82 for bulk transfer. */ /* All commands are two bytes only */ static void jlj_write2(struct gspca_dev *gspca_dev, unsigned char *command) { int retval; if (gspca_dev->usb_err < 0) return; memcpy(gspca_dev->usb_buf, command, 2); retval = usb_bulk_msg(gspca_dev->dev, usb_sndbulkpipe(gspca_dev->dev, 3), gspca_dev->usb_buf, 2, NULL, 500); if (retval < 0) { pr_err("command write [%02x] error %d\n", gspca_dev->usb_buf[0], retval); gspca_dev->usb_err = retval; } } /* Responses are one byte only */ static void jlj_read1(struct gspca_dev *gspca_dev, unsigned char *response) { int retval; if (gspca_dev->usb_err < 0) return; retval = usb_bulk_msg(gspca_dev->dev, usb_rcvbulkpipe(gspca_dev->dev, 0x84), gspca_dev->usb_buf, 1, NULL, 500); *response = gspca_dev->usb_buf[0]; if (retval < 0) { pr_err("read command [%02x] error %d\n", gspca_dev->usb_buf[0], retval); gspca_dev->usb_err = retval; } } static void setfreq(struct gspca_dev *gspca_dev, s32 val) { u8 freq_commands[][2] = { {0x71, 0x80}, {0x70, 0x07} }; freq_commands[0][1] |= val >> 1; jlj_write2(gspca_dev, freq_commands[0]); jlj_write2(gspca_dev, freq_commands[1]); } static void setcamquality(struct gspca_dev *gspca_dev, s32 val) { u8 quality_commands[][2] = { {0x71, 0x1E}, {0x70, 0x06} }; u8 camquality; /* adapt camera quality from jpeg quality */ camquality = ((QUALITY_MAX - val) * CAMQUALITY_MAX) / (QUALITY_MAX - QUALITY_MIN); quality_commands[0][1] += camquality; jlj_write2(gspca_dev, quality_commands[0]); jlj_write2(gspca_dev, quality_commands[1]); } static void setautogain(struct gspca_dev *gspca_dev, s32 val) { u8 autogain_commands[][2] = { {0x94, 0x02}, {0xcf, 0x00} }; autogain_commands[1][1] = val << 4; jlj_write2(gspca_dev, autogain_commands[0]); jlj_write2(gspca_dev, autogain_commands[1]); } static void setred(struct gspca_dev *gspca_dev, s32 val) { u8 setred_commands[][2] = { {0x94, 0x02}, {0xe6, 0x00} }; setred_commands[1][1] = val; jlj_write2(gspca_dev, setred_commands[0]); jlj_write2(gspca_dev, setred_commands[1]); } static void setgreen(struct gspca_dev *gspca_dev, s32 val) { u8 setgreen_commands[][2] = { {0x94, 0x02}, {0xe7, 0x00} }; setgreen_commands[1][1] = val; jlj_write2(gspca_dev, setgreen_commands[0]); jlj_write2(gspca_dev, setgreen_commands[1]); } static void setblue(struct gspca_dev *gspca_dev, s32 val) { u8 setblue_commands[][2] = { {0x94, 0x02}, {0xe9, 0x00} }; setblue_commands[1][1] = val; jlj_write2(gspca_dev, setblue_commands[0]); jlj_write2(gspca_dev, setblue_commands[1]); } static int jlj_start(struct gspca_dev *gspca_dev) { int i; int start_commands_size; u8 response = 0xff; struct sd *sd = (struct sd *) gspca_dev; struct jlj_command start_commands[] = { {{0x71, 0x81}, 0, 0}, {{0x70, 0x05}, 0, JEILINJ_CMD_DELAY}, {{0x95, 0x70}, 1, 0}, {{0x71, 0x81 - gspca_dev->curr_mode}, 0, 0}, {{0x70, 0x04}, 0, JEILINJ_CMD_DELAY}, {{0x95, 0x70}, 1, 0}, {{0x71, 0x00}, 0, 0}, /* start streaming ??*/ {{0x70, 0x08}, 0, JEILINJ_CMD_DELAY}, {{0x95, 0x70}, 1, 0}, #define SPORTSCAM_DV15_CMD_SIZE 9 {{0x94, 0x02}, 0, 0}, {{0xde, 0x24}, 0, 0}, {{0x94, 0x02}, 0, 0}, {{0xdd, 0xf0}, 0, 0}, {{0x94, 0x02}, 0, 0}, {{0xe3, 0x2c}, 0, 0}, {{0x94, 0x02}, 0, 0}, {{0xe4, 0x00}, 0, 0}, {{0x94, 0x02}, 0, 0}, {{0xe5, 0x00}, 0, 0}, {{0x94, 0x02}, 0, 0}, {{0xe6, 0x2c}, 0, 0}, {{0x94, 0x03}, 0, 0}, {{0xaa, 0x00}, 0, 0} }; sd->blocks_left = 0; /* Under Windows, USB spy shows that only the 9 first start * commands are used for SPORTSCAM_DV15 webcam */ if (sd->type == SPORTSCAM_DV15) start_commands_size = SPORTSCAM_DV15_CMD_SIZE; else start_commands_size = ARRAY_SIZE(start_commands); for (i = 0; i < start_commands_size; i++) { jlj_write2(gspca_dev, start_commands[i].instruction); if (start_commands[i].delay) msleep(start_commands[i].delay); if (start_commands[i].ack_wanted) jlj_read1(gspca_dev, &response); } setcamquality(gspca_dev, v4l2_ctrl_g_ctrl(sd->jpegqual)); msleep(2); setfreq(gspca_dev, v4l2_ctrl_g_ctrl(sd->freq)); if (gspca_dev->usb_err < 0) gspca_err(gspca_dev, "Start streaming command failed\n"); return gspca_dev->usb_err; } static void sd_pkt_scan(struct gspca_dev *gspca_dev, u8 *data, int len) { struct sd *sd = (struct sd *) gspca_dev; int packet_type; u32 header_marker; gspca_dbg(gspca_dev, D_STREAM, "Got %d bytes out of %d for Block 0\n", len, JEILINJ_MAX_TRANSFER); if (len != JEILINJ_MAX_TRANSFER) { gspca_dbg(gspca_dev, D_PACK, "bad length\n"); goto discard; } /* check if it's start of frame */ header_marker = ((u32 *)data)[0]; if (header_marker == FRAME_START) { sd->blocks_left = data[0x0a] - 1; gspca_dbg(gspca_dev, D_STREAM, "blocks_left = 0x%x\n", sd->blocks_left); /* Start a new frame, and add the JPEG header, first thing */ gspca_frame_add(gspca_dev, FIRST_PACKET, sd->jpeg_hdr, JPEG_HDR_SZ); /* Toss line 0 of data block 0, keep the rest. */ gspca_frame_add(gspca_dev, INTER_PACKET, data + FRAME_HEADER_LEN, JEILINJ_MAX_TRANSFER - FRAME_HEADER_LEN); } else if (sd->blocks_left > 0) { gspca_dbg(gspca_dev, D_STREAM, "%d blocks remaining for frame\n", sd->blocks_left); sd->blocks_left -= 1; if (sd->blocks_left == 0) packet_type = LAST_PACKET; else packet_type = INTER_PACKET; gspca_frame_add(gspca_dev, packet_type, data, JEILINJ_MAX_TRANSFER); } else goto discard; return; discard: /* Discard data until a new frame starts. */ gspca_dev->last_packet_type = DISCARD_PACKET; } /* This function is called at probe time just before sd_init */ static int sd_config(struct gspca_dev *gspca_dev, const struct usb_device_id *id) { struct cam *cam = &gspca_dev->cam; struct sd *dev = (struct sd *) gspca_dev; dev->type = id->driver_info; dev->quality = QUALITY_DEF; cam->cam_mode = jlj_mode; cam->nmodes = ARRAY_SIZE(jlj_mode); cam->bulk = 1; cam->bulk_nurbs = 1; cam->bulk_size = JEILINJ_MAX_TRANSFER; return 0; } static void sd_stopN(struct gspca_dev *gspca_dev) { int i; u8 *buf; static u8 stop_commands[][2] = { {0x71, 0x00}, {0x70, 0x09}, {0x71, 0x80}, {0x70, 0x05} }; for (;;) { /* get the image remaining blocks */ usb_bulk_msg(gspca_dev->dev, gspca_dev->urb[0]->pipe, gspca_dev->urb[0]->transfer_buffer, JEILINJ_MAX_TRANSFER, NULL, JEILINJ_DATA_TIMEOUT); /* search for 0xff 0xd9 (EOF for JPEG) */ i = 0; buf = gspca_dev->urb[0]->transfer_buffer; while ((i < (JEILINJ_MAX_TRANSFER - 1)) && ((buf[i] != 0xff) || (buf[i+1] != 0xd9))) i++; if (i != (JEILINJ_MAX_TRANSFER - 1)) /* last remaining block found */ break; } for (i = 0; i < ARRAY_SIZE(stop_commands); i++) jlj_write2(gspca_dev, stop_commands[i]); } /* this function is called at probe and resume time */ static int sd_init(struct gspca_dev *gspca_dev) { return gspca_dev->usb_err; } /* Set up for getting frames. */ static int sd_start(struct gspca_dev *gspca_dev) { struct sd *dev = (struct sd *) gspca_dev; /* create the JPEG header */ jpeg_define(dev->jpeg_hdr, gspca_dev->pixfmt.height, gspca_dev->pixfmt.width, 0x21); /* JPEG 422 */ jpeg_set_qual(dev->jpeg_hdr, dev->quality); gspca_dbg(gspca_dev, D_STREAM, "Start streaming at %dx%d\n", gspca_dev->pixfmt.height, gspca_dev->pixfmt.width); jlj_start(gspca_dev); return gspca_dev->usb_err; } /* Table of supported USB devices */ static const struct usb_device_id device_table[] = { {USB_DEVICE(0x0979, 0x0280), .driver_info = SAKAR_57379}, {USB_DEVICE(0x0979, 0x0270), .driver_info = SPORTSCAM_DV15}, {} }; MODULE_DEVICE_TABLE(usb, device_table); static int sd_s_ctrl(struct v4l2_ctrl *ctrl) { struct gspca_dev *gspca_dev = container_of(ctrl->handler, struct gspca_dev, ctrl_handler); struct sd *sd = (struct sd *)gspca_dev; gspca_dev->usb_err = 0; if (!gspca_dev->streaming) return 0; switch (ctrl->id) { case V4L2_CID_POWER_LINE_FREQUENCY: setfreq(gspca_dev, ctrl->val); break; case V4L2_CID_RED_BALANCE: setred(gspca_dev, ctrl->val); break; case V4L2_CID_GAIN: setgreen(gspca_dev, ctrl->val); break; case V4L2_CID_BLUE_BALANCE: setblue(gspca_dev, ctrl->val); break; case V4L2_CID_AUTOGAIN: setautogain(gspca_dev, ctrl->val); break; case V4L2_CID_JPEG_COMPRESSION_QUALITY: jpeg_set_qual(sd->jpeg_hdr, ctrl->val); setcamquality(gspca_dev, ctrl->val); break; } return gspca_dev->usb_err; } static const struct v4l2_ctrl_ops sd_ctrl_ops = { .s_ctrl = sd_s_ctrl, }; static int sd_init_controls(struct gspca_dev *gspca_dev) { struct sd *sd = (struct sd *)gspca_dev; struct v4l2_ctrl_handler *hdl = &gspca_dev->ctrl_handler; static const struct v4l2_ctrl_config custom_autogain = { .ops = &sd_ctrl_ops, .id = V4L2_CID_AUTOGAIN, .type = V4L2_CTRL_TYPE_INTEGER, .name = "Automatic Gain (and Exposure)", .max = 3, .step = 1, .def = 0, }; gspca_dev->vdev.ctrl_handler = hdl; v4l2_ctrl_handler_init(hdl, 6); sd->freq = v4l2_ctrl_new_std_menu(hdl, &sd_ctrl_ops, V4L2_CID_POWER_LINE_FREQUENCY, V4L2_CID_POWER_LINE_FREQUENCY_60HZ, 1, V4L2_CID_POWER_LINE_FREQUENCY_60HZ); v4l2_ctrl_new_custom(hdl, &custom_autogain, NULL); v4l2_ctrl_new_std(hdl, &sd_ctrl_ops, V4L2_CID_RED_BALANCE, 0, 3, 1, 2); v4l2_ctrl_new_std(hdl, &sd_ctrl_ops, V4L2_CID_GAIN, 0, 3, 1, 2); v4l2_ctrl_new_std(hdl, &sd_ctrl_ops, V4L2_CID_BLUE_BALANCE, 0, 3, 1, 2); sd->jpegqual = v4l2_ctrl_new_std(hdl, &sd_ctrl_ops, V4L2_CID_JPEG_COMPRESSION_QUALITY, QUALITY_MIN, QUALITY_MAX, 1, QUALITY_DEF); if (hdl->error) { pr_err("Could not initialize controls\n"); return hdl->error; } return 0; } static int sd_set_jcomp(struct gspca_dev *gspca_dev, const struct v4l2_jpegcompression *jcomp) { struct sd *sd = (struct sd *) gspca_dev; v4l2_ctrl_s_ctrl(sd->jpegqual, jcomp->quality); return 0; } static int sd_get_jcomp(struct gspca_dev *gspca_dev, struct v4l2_jpegcompression *jcomp) { struct sd *sd = (struct sd *) gspca_dev; memset(jcomp, 0, sizeof *jcomp); jcomp->quality = v4l2_ctrl_g_ctrl(sd->jpegqual); jcomp->jpeg_markers = V4L2_JPEG_MARKER_DHT | V4L2_JPEG_MARKER_DQT; return 0; } /* sub-driver description */ static const struct sd_desc sd_desc_sakar_57379 = { .name = MODULE_NAME, .config = sd_config, .init = sd_init, .start = sd_start, .stopN = sd_stopN, .pkt_scan = sd_pkt_scan, }; /* sub-driver description */ static const struct sd_desc sd_desc_sportscam_dv15 = { .name = MODULE_NAME, .config = sd_config, .init = sd_init, .init_controls = sd_init_controls, .start = sd_start, .stopN = sd_stopN, .pkt_scan = sd_pkt_scan, .get_jcomp = sd_get_jcomp, .set_jcomp = sd_set_jcomp, }; static const struct sd_desc *sd_desc[2] = { &sd_desc_sakar_57379, &sd_desc_sportscam_dv15 }; /* -- device connect -- */ static int sd_probe(struct usb_interface *intf, const struct usb_device_id *id) { return gspca_dev_probe(intf, id, sd_desc[id->driver_info], sizeof(struct sd), THIS_MODULE); } static struct usb_driver sd_driver = { .name = MODULE_NAME, .id_table = device_table, .probe = sd_probe, .disconnect = gspca_disconnect, #ifdef CONFIG_PM .suspend = gspca_suspend, .resume = gspca_resume, .reset_resume = gspca_resume, #endif }; module_usb_driver(sd_driver);
42 42 37 38 37 38 2 2 2 38 37 35 5 5 30 1 1 1 4 13 13 13 13 13 13 9 7 2 2 2 5 16 16 6 6 6 42 34 6 38 1 5 2 3 1 1 1 1 5 1 2 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) B.A.T.M.A.N. contributors: * * Simon Wunderlich */ #include "bridge_loop_avoidance.h" #include "main.h" #include <linux/atomic.h> #include <linux/byteorder/generic.h> #include <linux/compiler.h> #include <linux/container_of.h> #include <linux/crc16.h> #include <linux/errno.h> #include <linux/etherdevice.h> #include <linux/gfp.h> #include <linux/if_arp.h> #include <linux/if_ether.h> #include <linux/if_vlan.h> #include <linux/jhash.h> #include <linux/jiffies.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/sprintf.h> #include <linux/stddef.h> #include <linux/string.h> #include <linux/workqueue.h> #include <net/arp.h> #include <net/genetlink.h> #include <net/netlink.h> #include <net/sock.h> #include <uapi/linux/batadv_packet.h> #include <uapi/linux/batman_adv.h> #include "hard-interface.h" #include "hash.h" #include "log.h" #include "netlink.h" #include "originator.h" #include "soft-interface.h" #include "translation-table.h" static const u8 batadv_announce_mac[4] = {0x43, 0x05, 0x43, 0x05}; static void batadv_bla_periodic_work(struct work_struct *work); static void batadv_bla_send_announce(struct batadv_priv *bat_priv, struct batadv_bla_backbone_gw *backbone_gw); /** * batadv_choose_claim() - choose the right bucket for a claim. * @data: data to hash * @size: size of the hash table * * Return: the hash index of the claim */ static inline u32 batadv_choose_claim(const void *data, u32 size) { const struct batadv_bla_claim *claim = data; u32 hash = 0; hash = jhash(&claim->addr, sizeof(claim->addr), hash); hash = jhash(&claim->vid, sizeof(claim->vid), hash); return hash % size; } /** * batadv_choose_backbone_gw() - choose the right bucket for a backbone gateway. * @data: data to hash * @size: size of the hash table * * Return: the hash index of the backbone gateway */ static inline u32 batadv_choose_backbone_gw(const void *data, u32 size) { const struct batadv_bla_backbone_gw *gw; u32 hash = 0; gw = data; hash = jhash(&gw->orig, sizeof(gw->orig), hash); hash = jhash(&gw->vid, sizeof(gw->vid), hash); return hash % size; } /** * batadv_compare_backbone_gw() - compare address and vid of two backbone gws * @node: list node of the first entry to compare * @data2: pointer to the second backbone gateway * * Return: true if the backbones have the same data, false otherwise */ static bool batadv_compare_backbone_gw(const struct hlist_node *node, const void *data2) { const void *data1 = container_of(node, struct batadv_bla_backbone_gw, hash_entry); const struct batadv_bla_backbone_gw *gw1 = data1; const struct batadv_bla_backbone_gw *gw2 = data2; if (!batadv_compare_eth(gw1->orig, gw2->orig)) return false; if (gw1->vid != gw2->vid) return false; return true; } /** * batadv_compare_claim() - compare address and vid of two claims * @node: list node of the first entry to compare * @data2: pointer to the second claims * * Return: true if the claim have the same data, 0 otherwise */ static bool batadv_compare_claim(const struct hlist_node *node, const void *data2) { const void *data1 = container_of(node, struct batadv_bla_claim, hash_entry); const struct batadv_bla_claim *cl1 = data1; const struct batadv_bla_claim *cl2 = data2; if (!batadv_compare_eth(cl1->addr, cl2->addr)) return false; if (cl1->vid != cl2->vid) return false; return true; } /** * batadv_backbone_gw_release() - release backbone gw from lists and queue for * free after rcu grace period * @ref: kref pointer of the backbone gw */ static void batadv_backbone_gw_release(struct kref *ref) { struct batadv_bla_backbone_gw *backbone_gw; backbone_gw = container_of(ref, struct batadv_bla_backbone_gw, refcount); kfree_rcu(backbone_gw, rcu); } /** * batadv_backbone_gw_put() - decrement the backbone gw refcounter and possibly * release it * @backbone_gw: backbone gateway to be free'd */ static void batadv_backbone_gw_put(struct batadv_bla_backbone_gw *backbone_gw) { if (!backbone_gw) return; kref_put(&backbone_gw->refcount, batadv_backbone_gw_release); } /** * batadv_claim_release() - release claim from lists and queue for free after * rcu grace period * @ref: kref pointer of the claim */ static void batadv_claim_release(struct kref *ref) { struct batadv_bla_claim *claim; struct batadv_bla_backbone_gw *old_backbone_gw; claim = container_of(ref, struct batadv_bla_claim, refcount); spin_lock_bh(&claim->backbone_lock); old_backbone_gw = claim->backbone_gw; claim->backbone_gw = NULL; spin_unlock_bh(&claim->backbone_lock); spin_lock_bh(&old_backbone_gw->crc_lock); old_backbone_gw->crc ^= crc16(0, claim->addr, ETH_ALEN); spin_unlock_bh(&old_backbone_gw->crc_lock); batadv_backbone_gw_put(old_backbone_gw); kfree_rcu(claim, rcu); } /** * batadv_claim_put() - decrement the claim refcounter and possibly release it * @claim: claim to be free'd */ static void batadv_claim_put(struct batadv_bla_claim *claim) { if (!claim) return; kref_put(&claim->refcount, batadv_claim_release); } /** * batadv_claim_hash_find() - looks for a claim in the claim hash * @bat_priv: the bat priv with all the soft interface information * @data: search data (may be local/static data) * * Return: claim if found or NULL otherwise. */ static struct batadv_bla_claim * batadv_claim_hash_find(struct batadv_priv *bat_priv, struct batadv_bla_claim *data) { struct batadv_hashtable *hash = bat_priv->bla.claim_hash; struct hlist_head *head; struct batadv_bla_claim *claim; struct batadv_bla_claim *claim_tmp = NULL; int index; if (!hash) return NULL; index = batadv_choose_claim(data, hash->size); head = &hash->table[index]; rcu_read_lock(); hlist_for_each_entry_rcu(claim, head, hash_entry) { if (!batadv_compare_claim(&claim->hash_entry, data)) continue; if (!kref_get_unless_zero(&claim->refcount)) continue; claim_tmp = claim; break; } rcu_read_unlock(); return claim_tmp; } /** * batadv_backbone_hash_find() - looks for a backbone gateway in the hash * @bat_priv: the bat priv with all the soft interface information * @addr: the address of the originator * @vid: the VLAN ID * * Return: backbone gateway if found or NULL otherwise */ static struct batadv_bla_backbone_gw * batadv_backbone_hash_find(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid) { struct batadv_hashtable *hash = bat_priv->bla.backbone_hash; struct hlist_head *head; struct batadv_bla_backbone_gw search_entry, *backbone_gw; struct batadv_bla_backbone_gw *backbone_gw_tmp = NULL; int index; if (!hash) return NULL; ether_addr_copy(search_entry.orig, addr); search_entry.vid = vid; index = batadv_choose_backbone_gw(&search_entry, hash->size); head = &hash->table[index]; rcu_read_lock(); hlist_for_each_entry_rcu(backbone_gw, head, hash_entry) { if (!batadv_compare_backbone_gw(&backbone_gw->hash_entry, &search_entry)) continue; if (!kref_get_unless_zero(&backbone_gw->refcount)) continue; backbone_gw_tmp = backbone_gw; break; } rcu_read_unlock(); return backbone_gw_tmp; } /** * batadv_bla_del_backbone_claims() - delete all claims for a backbone * @backbone_gw: backbone gateway where the claims should be removed */ static void batadv_bla_del_backbone_claims(struct batadv_bla_backbone_gw *backbone_gw) { struct batadv_hashtable *hash; struct hlist_node *node_tmp; struct hlist_head *head; struct batadv_bla_claim *claim; int i; spinlock_t *list_lock; /* protects write access to the hash lists */ hash = backbone_gw->bat_priv->bla.claim_hash; if (!hash) return; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; list_lock = &hash->list_locks[i]; spin_lock_bh(list_lock); hlist_for_each_entry_safe(claim, node_tmp, head, hash_entry) { if (claim->backbone_gw != backbone_gw) continue; batadv_claim_put(claim); hlist_del_rcu(&claim->hash_entry); } spin_unlock_bh(list_lock); } /* all claims gone, initialize CRC */ spin_lock_bh(&backbone_gw->crc_lock); backbone_gw->crc = BATADV_BLA_CRC_INIT; spin_unlock_bh(&backbone_gw->crc_lock); } /** * batadv_bla_send_claim() - sends a claim frame according to the provided info * @bat_priv: the bat priv with all the soft interface information * @mac: the mac address to be announced within the claim * @vid: the VLAN ID * @claimtype: the type of the claim (CLAIM, UNCLAIM, ANNOUNCE, ...) */ static void batadv_bla_send_claim(struct batadv_priv *bat_priv, const u8 *mac, unsigned short vid, int claimtype) { struct sk_buff *skb; struct ethhdr *ethhdr; struct batadv_hard_iface *primary_if; struct net_device *soft_iface; u8 *hw_src; struct batadv_bla_claim_dst local_claim_dest; __be32 zeroip = 0; primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) return; memcpy(&local_claim_dest, &bat_priv->bla.claim_dest, sizeof(local_claim_dest)); local_claim_dest.type = claimtype; soft_iface = primary_if->soft_iface; skb = arp_create(ARPOP_REPLY, ETH_P_ARP, /* IP DST: 0.0.0.0 */ zeroip, primary_if->soft_iface, /* IP SRC: 0.0.0.0 */ zeroip, /* Ethernet DST: Broadcast */ NULL, /* Ethernet SRC/HW SRC: originator mac */ primary_if->net_dev->dev_addr, /* HW DST: FF:43:05:XX:YY:YY * with XX = claim type * and YY:YY = group id */ (u8 *)&local_claim_dest); if (!skb) goto out; ethhdr = (struct ethhdr *)skb->data; hw_src = (u8 *)ethhdr + ETH_HLEN + sizeof(struct arphdr); /* now we pretend that the client would have sent this ... */ switch (claimtype) { case BATADV_CLAIM_TYPE_CLAIM: /* normal claim frame * set Ethernet SRC to the clients mac */ ether_addr_copy(ethhdr->h_source, mac); batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): CLAIM %pM on vid %d\n", __func__, mac, batadv_print_vid(vid)); break; case BATADV_CLAIM_TYPE_UNCLAIM: /* unclaim frame * set HW SRC to the clients mac */ ether_addr_copy(hw_src, mac); batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): UNCLAIM %pM on vid %d\n", __func__, mac, batadv_print_vid(vid)); break; case BATADV_CLAIM_TYPE_ANNOUNCE: /* announcement frame * set HW SRC to the special mac containing the crc */ ether_addr_copy(hw_src, mac); batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): ANNOUNCE of %pM on vid %d\n", __func__, ethhdr->h_source, batadv_print_vid(vid)); break; case BATADV_CLAIM_TYPE_REQUEST: /* request frame * set HW SRC and header destination to the receiving backbone * gws mac */ ether_addr_copy(hw_src, mac); ether_addr_copy(ethhdr->h_dest, mac); batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): REQUEST of %pM to %pM on vid %d\n", __func__, ethhdr->h_source, ethhdr->h_dest, batadv_print_vid(vid)); break; case BATADV_CLAIM_TYPE_LOOPDETECT: ether_addr_copy(ethhdr->h_source, mac); batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): LOOPDETECT of %pM to %pM on vid %d\n", __func__, ethhdr->h_source, ethhdr->h_dest, batadv_print_vid(vid)); break; } if (vid & BATADV_VLAN_HAS_TAG) { skb = vlan_insert_tag(skb, htons(ETH_P_8021Q), vid & VLAN_VID_MASK); if (!skb) goto out; } skb_reset_mac_header(skb); skb->protocol = eth_type_trans(skb, soft_iface); batadv_inc_counter(bat_priv, BATADV_CNT_RX); batadv_add_counter(bat_priv, BATADV_CNT_RX_BYTES, skb->len + ETH_HLEN); netif_rx(skb); out: batadv_hardif_put(primary_if); } /** * batadv_bla_loopdetect_report() - worker for reporting the loop * @work: work queue item * * Throws an uevent, as the loopdetect check function can't do that itself * since the kernel may sleep while throwing uevents. */ static void batadv_bla_loopdetect_report(struct work_struct *work) { struct batadv_bla_backbone_gw *backbone_gw; struct batadv_priv *bat_priv; char vid_str[6] = { '\0' }; backbone_gw = container_of(work, struct batadv_bla_backbone_gw, report_work); bat_priv = backbone_gw->bat_priv; batadv_info(bat_priv->soft_iface, "Possible loop on VLAN %d detected which can't be handled by BLA - please check your network setup!\n", batadv_print_vid(backbone_gw->vid)); snprintf(vid_str, sizeof(vid_str), "%d", batadv_print_vid(backbone_gw->vid)); vid_str[sizeof(vid_str) - 1] = 0; batadv_throw_uevent(bat_priv, BATADV_UEV_BLA, BATADV_UEV_LOOPDETECT, vid_str); batadv_backbone_gw_put(backbone_gw); } /** * batadv_bla_get_backbone_gw() - finds or creates a backbone gateway * @bat_priv: the bat priv with all the soft interface information * @orig: the mac address of the originator * @vid: the VLAN ID * @own_backbone: set if the requested backbone is local * * Return: the (possibly created) backbone gateway or NULL on error */ static struct batadv_bla_backbone_gw * batadv_bla_get_backbone_gw(struct batadv_priv *bat_priv, const u8 *orig, unsigned short vid, bool own_backbone) { struct batadv_bla_backbone_gw *entry; struct batadv_orig_node *orig_node; int hash_added; entry = batadv_backbone_hash_find(bat_priv, orig, vid); if (entry) return entry; batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): not found (%pM, %d), creating new entry\n", __func__, orig, batadv_print_vid(vid)); entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) return NULL; entry->vid = vid; entry->lasttime = jiffies; entry->crc = BATADV_BLA_CRC_INIT; entry->bat_priv = bat_priv; spin_lock_init(&entry->crc_lock); atomic_set(&entry->request_sent, 0); atomic_set(&entry->wait_periods, 0); ether_addr_copy(entry->orig, orig); INIT_WORK(&entry->report_work, batadv_bla_loopdetect_report); kref_init(&entry->refcount); kref_get(&entry->refcount); hash_added = batadv_hash_add(bat_priv->bla.backbone_hash, batadv_compare_backbone_gw, batadv_choose_backbone_gw, entry, &entry->hash_entry); if (unlikely(hash_added != 0)) { /* hash failed, free the structure */ kfree(entry); return NULL; } /* this is a gateway now, remove any TT entry on this VLAN */ orig_node = batadv_orig_hash_find(bat_priv, orig); if (orig_node) { batadv_tt_global_del_orig(bat_priv, orig_node, vid, "became a backbone gateway"); batadv_orig_node_put(orig_node); } if (own_backbone) { batadv_bla_send_announce(bat_priv, entry); /* this will be decreased in the worker thread */ atomic_inc(&entry->request_sent); atomic_set(&entry->wait_periods, BATADV_BLA_WAIT_PERIODS); atomic_inc(&bat_priv->bla.num_requests); } return entry; } /** * batadv_bla_update_own_backbone_gw() - updates the own backbone gw for a VLAN * @bat_priv: the bat priv with all the soft interface information * @primary_if: the selected primary interface * @vid: VLAN identifier * * update or add the own backbone gw to make sure we announce * where we receive other backbone gws */ static void batadv_bla_update_own_backbone_gw(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, unsigned short vid) { struct batadv_bla_backbone_gw *backbone_gw; backbone_gw = batadv_bla_get_backbone_gw(bat_priv, primary_if->net_dev->dev_addr, vid, true); if (unlikely(!backbone_gw)) return; backbone_gw->lasttime = jiffies; batadv_backbone_gw_put(backbone_gw); } /** * batadv_bla_answer_request() - answer a bla request by sending own claims * @bat_priv: the bat priv with all the soft interface information * @primary_if: interface where the request came on * @vid: the vid where the request came on * * Repeat all of our own claims, and finally send an ANNOUNCE frame * to allow the requester another check if the CRC is correct now. */ static void batadv_bla_answer_request(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, unsigned short vid) { struct hlist_head *head; struct batadv_hashtable *hash; struct batadv_bla_claim *claim; struct batadv_bla_backbone_gw *backbone_gw; int i; batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): received a claim request, send all of our own claims again\n", __func__); backbone_gw = batadv_backbone_hash_find(bat_priv, primary_if->net_dev->dev_addr, vid); if (!backbone_gw) return; hash = bat_priv->bla.claim_hash; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; rcu_read_lock(); hlist_for_each_entry_rcu(claim, head, hash_entry) { /* only own claims are interesting */ if (claim->backbone_gw != backbone_gw) continue; batadv_bla_send_claim(bat_priv, claim->addr, claim->vid, BATADV_CLAIM_TYPE_CLAIM); } rcu_read_unlock(); } /* finally, send an announcement frame */ batadv_bla_send_announce(bat_priv, backbone_gw); batadv_backbone_gw_put(backbone_gw); } /** * batadv_bla_send_request() - send a request to repeat claims * @backbone_gw: the backbone gateway from whom we are out of sync * * When the crc is wrong, ask the backbone gateway for a full table update. * After the request, it will repeat all of his own claims and finally * send an announcement claim with which we can check again. */ static void batadv_bla_send_request(struct batadv_bla_backbone_gw *backbone_gw) { /* first, remove all old entries */ batadv_bla_del_backbone_claims(backbone_gw); batadv_dbg(BATADV_DBG_BLA, backbone_gw->bat_priv, "Sending REQUEST to %pM\n", backbone_gw->orig); /* send request */ batadv_bla_send_claim(backbone_gw->bat_priv, backbone_gw->orig, backbone_gw->vid, BATADV_CLAIM_TYPE_REQUEST); /* no local broadcasts should be sent or received, for now. */ if (!atomic_read(&backbone_gw->request_sent)) { atomic_inc(&backbone_gw->bat_priv->bla.num_requests); atomic_set(&backbone_gw->request_sent, 1); } } /** * batadv_bla_send_announce() - Send an announcement frame * @bat_priv: the bat priv with all the soft interface information * @backbone_gw: our backbone gateway which should be announced */ static void batadv_bla_send_announce(struct batadv_priv *bat_priv, struct batadv_bla_backbone_gw *backbone_gw) { u8 mac[ETH_ALEN]; __be16 crc; memcpy(mac, batadv_announce_mac, 4); spin_lock_bh(&backbone_gw->crc_lock); crc = htons(backbone_gw->crc); spin_unlock_bh(&backbone_gw->crc_lock); memcpy(&mac[4], &crc, 2); batadv_bla_send_claim(bat_priv, mac, backbone_gw->vid, BATADV_CLAIM_TYPE_ANNOUNCE); } /** * batadv_bla_add_claim() - Adds a claim in the claim hash * @bat_priv: the bat priv with all the soft interface information * @mac: the mac address of the claim * @vid: the VLAN ID of the frame * @backbone_gw: the backbone gateway which claims it */ static void batadv_bla_add_claim(struct batadv_priv *bat_priv, const u8 *mac, const unsigned short vid, struct batadv_bla_backbone_gw *backbone_gw) { struct batadv_bla_backbone_gw *old_backbone_gw; struct batadv_bla_claim *claim; struct batadv_bla_claim search_claim; bool remove_crc = false; int hash_added; ether_addr_copy(search_claim.addr, mac); search_claim.vid = vid; claim = batadv_claim_hash_find(bat_priv, &search_claim); /* create a new claim entry if it does not exist yet. */ if (!claim) { claim = kzalloc(sizeof(*claim), GFP_ATOMIC); if (!claim) return; ether_addr_copy(claim->addr, mac); spin_lock_init(&claim->backbone_lock); claim->vid = vid; claim->lasttime = jiffies; kref_get(&backbone_gw->refcount); claim->backbone_gw = backbone_gw; kref_init(&claim->refcount); batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): adding new entry %pM, vid %d to hash ...\n", __func__, mac, batadv_print_vid(vid)); kref_get(&claim->refcount); hash_added = batadv_hash_add(bat_priv->bla.claim_hash, batadv_compare_claim, batadv_choose_claim, claim, &claim->hash_entry); if (unlikely(hash_added != 0)) { /* only local changes happened. */ kfree(claim); return; } } else { claim->lasttime = jiffies; if (claim->backbone_gw == backbone_gw) /* no need to register a new backbone */ goto claim_free_ref; batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): changing ownership for %pM, vid %d to gw %pM\n", __func__, mac, batadv_print_vid(vid), backbone_gw->orig); remove_crc = true; } /* replace backbone_gw atomically and adjust reference counters */ spin_lock_bh(&claim->backbone_lock); old_backbone_gw = claim->backbone_gw; kref_get(&backbone_gw->refcount); claim->backbone_gw = backbone_gw; spin_unlock_bh(&claim->backbone_lock); if (remove_crc) { /* remove claim address from old backbone_gw */ spin_lock_bh(&old_backbone_gw->crc_lock); old_backbone_gw->crc ^= crc16(0, claim->addr, ETH_ALEN); spin_unlock_bh(&old_backbone_gw->crc_lock); } batadv_backbone_gw_put(old_backbone_gw); /* add claim address to new backbone_gw */ spin_lock_bh(&backbone_gw->crc_lock); backbone_gw->crc ^= crc16(0, claim->addr, ETH_ALEN); spin_unlock_bh(&backbone_gw->crc_lock); backbone_gw->lasttime = jiffies; claim_free_ref: batadv_claim_put(claim); } /** * batadv_bla_claim_get_backbone_gw() - Get valid reference for backbone_gw of * claim * @claim: claim whose backbone_gw should be returned * * Return: valid reference to claim::backbone_gw */ static struct batadv_bla_backbone_gw * batadv_bla_claim_get_backbone_gw(struct batadv_bla_claim *claim) { struct batadv_bla_backbone_gw *backbone_gw; spin_lock_bh(&claim->backbone_lock); backbone_gw = claim->backbone_gw; kref_get(&backbone_gw->refcount); spin_unlock_bh(&claim->backbone_lock); return backbone_gw; } /** * batadv_bla_del_claim() - delete a claim from the claim hash * @bat_priv: the bat priv with all the soft interface information * @mac: mac address of the claim to be removed * @vid: VLAN id for the claim to be removed */ static void batadv_bla_del_claim(struct batadv_priv *bat_priv, const u8 *mac, const unsigned short vid) { struct batadv_bla_claim search_claim, *claim; struct batadv_bla_claim *claim_removed_entry; struct hlist_node *claim_removed_node; ether_addr_copy(search_claim.addr, mac); search_claim.vid = vid; claim = batadv_claim_hash_find(bat_priv, &search_claim); if (!claim) return; batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): %pM, vid %d\n", __func__, mac, batadv_print_vid(vid)); claim_removed_node = batadv_hash_remove(bat_priv->bla.claim_hash, batadv_compare_claim, batadv_choose_claim, claim); if (!claim_removed_node) goto free_claim; /* reference from the hash is gone */ claim_removed_entry = hlist_entry(claim_removed_node, struct batadv_bla_claim, hash_entry); batadv_claim_put(claim_removed_entry); free_claim: /* don't need the reference from hash_find() anymore */ batadv_claim_put(claim); } /** * batadv_handle_announce() - check for ANNOUNCE frame * @bat_priv: the bat priv with all the soft interface information * @an_addr: announcement mac address (ARP Sender HW address) * @backbone_addr: originator address of the sender (Ethernet source MAC) * @vid: the VLAN ID of the frame * * Return: true if handled */ static bool batadv_handle_announce(struct batadv_priv *bat_priv, u8 *an_addr, u8 *backbone_addr, unsigned short vid) { struct batadv_bla_backbone_gw *backbone_gw; u16 backbone_crc, crc; if (memcmp(an_addr, batadv_announce_mac, 4) != 0) return false; backbone_gw = batadv_bla_get_backbone_gw(bat_priv, backbone_addr, vid, false); if (unlikely(!backbone_gw)) return true; /* handle as ANNOUNCE frame */ backbone_gw->lasttime = jiffies; crc = ntohs(*((__force __be16 *)(&an_addr[4]))); batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): ANNOUNCE vid %d (sent by %pM)... CRC = %#.4x\n", __func__, batadv_print_vid(vid), backbone_gw->orig, crc); spin_lock_bh(&backbone_gw->crc_lock); backbone_crc = backbone_gw->crc; spin_unlock_bh(&backbone_gw->crc_lock); if (backbone_crc != crc) { batadv_dbg(BATADV_DBG_BLA, backbone_gw->bat_priv, "%s(): CRC FAILED for %pM/%d (my = %#.4x, sent = %#.4x)\n", __func__, backbone_gw->orig, batadv_print_vid(backbone_gw->vid), backbone_crc, crc); batadv_bla_send_request(backbone_gw); } else { /* if we have sent a request and the crc was OK, * we can allow traffic again. */ if (atomic_read(&backbone_gw->request_sent)) { atomic_dec(&backbone_gw->bat_priv->bla.num_requests); atomic_set(&backbone_gw->request_sent, 0); } } batadv_backbone_gw_put(backbone_gw); return true; } /** * batadv_handle_request() - check for REQUEST frame * @bat_priv: the bat priv with all the soft interface information * @primary_if: the primary hard interface of this batman soft interface * @backbone_addr: backbone address to be requested (ARP sender HW MAC) * @ethhdr: ethernet header of a packet * @vid: the VLAN ID of the frame * * Return: true if handled */ static bool batadv_handle_request(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, u8 *backbone_addr, struct ethhdr *ethhdr, unsigned short vid) { /* check for REQUEST frame */ if (!batadv_compare_eth(backbone_addr, ethhdr->h_dest)) return false; /* sanity check, this should not happen on a normal switch, * we ignore it in this case. */ if (!batadv_compare_eth(ethhdr->h_dest, primary_if->net_dev->dev_addr)) return true; batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): REQUEST vid %d (sent by %pM)...\n", __func__, batadv_print_vid(vid), ethhdr->h_source); batadv_bla_answer_request(bat_priv, primary_if, vid); return true; } /** * batadv_handle_unclaim() - check for UNCLAIM frame * @bat_priv: the bat priv with all the soft interface information * @primary_if: the primary hard interface of this batman soft interface * @backbone_addr: originator address of the backbone (Ethernet source) * @claim_addr: Client to be unclaimed (ARP sender HW MAC) * @vid: the VLAN ID of the frame * * Return: true if handled */ static bool batadv_handle_unclaim(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, const u8 *backbone_addr, const u8 *claim_addr, unsigned short vid) { struct batadv_bla_backbone_gw *backbone_gw; /* unclaim in any case if it is our own */ if (primary_if && batadv_compare_eth(backbone_addr, primary_if->net_dev->dev_addr)) batadv_bla_send_claim(bat_priv, claim_addr, vid, BATADV_CLAIM_TYPE_UNCLAIM); backbone_gw = batadv_backbone_hash_find(bat_priv, backbone_addr, vid); if (!backbone_gw) return true; /* this must be an UNCLAIM frame */ batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): UNCLAIM %pM on vid %d (sent by %pM)...\n", __func__, claim_addr, batadv_print_vid(vid), backbone_gw->orig); batadv_bla_del_claim(bat_priv, claim_addr, vid); batadv_backbone_gw_put(backbone_gw); return true; } /** * batadv_handle_claim() - check for CLAIM frame * @bat_priv: the bat priv with all the soft interface information * @primary_if: the primary hard interface of this batman soft interface * @backbone_addr: originator address of the backbone (Ethernet Source) * @claim_addr: client mac address to be claimed (ARP sender HW MAC) * @vid: the VLAN ID of the frame * * Return: true if handled */ static bool batadv_handle_claim(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, const u8 *backbone_addr, const u8 *claim_addr, unsigned short vid) { struct batadv_bla_backbone_gw *backbone_gw; /* register the gateway if not yet available, and add the claim. */ backbone_gw = batadv_bla_get_backbone_gw(bat_priv, backbone_addr, vid, false); if (unlikely(!backbone_gw)) return true; /* this must be a CLAIM frame */ batadv_bla_add_claim(bat_priv, claim_addr, vid, backbone_gw); if (batadv_compare_eth(backbone_addr, primary_if->net_dev->dev_addr)) batadv_bla_send_claim(bat_priv, claim_addr, vid, BATADV_CLAIM_TYPE_CLAIM); /* TODO: we could call something like tt_local_del() here. */ batadv_backbone_gw_put(backbone_gw); return true; } /** * batadv_check_claim_group() - check for claim group membership * @bat_priv: the bat priv with all the soft interface information * @primary_if: the primary interface of this batman interface * @hw_src: the Hardware source in the ARP Header * @hw_dst: the Hardware destination in the ARP Header * @ethhdr: pointer to the Ethernet header of the claim frame * * checks if it is a claim packet and if it's on the same group. * This function also applies the group ID of the sender * if it is in the same mesh. * * Return: * 2 - if it is a claim packet and on the same group * 1 - if is a claim packet from another group * 0 - if it is not a claim packet */ static int batadv_check_claim_group(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, u8 *hw_src, u8 *hw_dst, struct ethhdr *ethhdr) { u8 *backbone_addr; struct batadv_orig_node *orig_node; struct batadv_bla_claim_dst *bla_dst, *bla_dst_own; bla_dst = (struct batadv_bla_claim_dst *)hw_dst; bla_dst_own = &bat_priv->bla.claim_dest; /* if announcement packet, use the source, * otherwise assume it is in the hw_src */ switch (bla_dst->type) { case BATADV_CLAIM_TYPE_CLAIM: backbone_addr = hw_src; break; case BATADV_CLAIM_TYPE_REQUEST: case BATADV_CLAIM_TYPE_ANNOUNCE: case BATADV_CLAIM_TYPE_UNCLAIM: backbone_addr = ethhdr->h_source; break; default: return 0; } /* don't accept claim frames from ourselves */ if (batadv_compare_eth(backbone_addr, primary_if->net_dev->dev_addr)) return 0; /* if its already the same group, it is fine. */ if (bla_dst->group == bla_dst_own->group) return 2; /* lets see if this originator is in our mesh */ orig_node = batadv_orig_hash_find(bat_priv, backbone_addr); /* don't accept claims from gateways which are not in * the same mesh or group. */ if (!orig_node) return 1; /* if our mesh friends mac is bigger, use it for ourselves. */ if (ntohs(bla_dst->group) > ntohs(bla_dst_own->group)) { batadv_dbg(BATADV_DBG_BLA, bat_priv, "taking other backbones claim group: %#.4x\n", ntohs(bla_dst->group)); bla_dst_own->group = bla_dst->group; } batadv_orig_node_put(orig_node); return 2; } /** * batadv_bla_process_claim() - Check if this is a claim frame, and process it * @bat_priv: the bat priv with all the soft interface information * @primary_if: the primary hard interface of this batman soft interface * @skb: the frame to be checked * * Return: true if it was a claim frame, otherwise return false to * tell the callee that it can use the frame on its own. */ static bool batadv_bla_process_claim(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, struct sk_buff *skb) { struct batadv_bla_claim_dst *bla_dst, *bla_dst_own; u8 *hw_src, *hw_dst; struct vlan_hdr *vhdr, vhdr_buf; struct ethhdr *ethhdr; struct arphdr *arphdr; unsigned short vid; int vlan_depth = 0; __be16 proto; int headlen; int ret; vid = batadv_get_vid(skb, 0); ethhdr = eth_hdr(skb); proto = ethhdr->h_proto; headlen = ETH_HLEN; if (vid & BATADV_VLAN_HAS_TAG) { /* Traverse the VLAN/Ethertypes. * * At this point it is known that the first protocol is a VLAN * header, so start checking at the encapsulated protocol. * * The depth of the VLAN headers is recorded to drop BLA claim * frames encapsulated into multiple VLAN headers (QinQ). */ do { vhdr = skb_header_pointer(skb, headlen, VLAN_HLEN, &vhdr_buf); if (!vhdr) return false; proto = vhdr->h_vlan_encapsulated_proto; headlen += VLAN_HLEN; vlan_depth++; } while (proto == htons(ETH_P_8021Q)); } if (proto != htons(ETH_P_ARP)) return false; /* not a claim frame */ /* this must be a ARP frame. check if it is a claim. */ if (unlikely(!pskb_may_pull(skb, headlen + arp_hdr_len(skb->dev)))) return false; /* pskb_may_pull() may have modified the pointers, get ethhdr again */ ethhdr = eth_hdr(skb); arphdr = (struct arphdr *)((u8 *)ethhdr + headlen); /* Check whether the ARP frame carries a valid * IP information */ if (arphdr->ar_hrd != htons(ARPHRD_ETHER)) return false; if (arphdr->ar_pro != htons(ETH_P_IP)) return false; if (arphdr->ar_hln != ETH_ALEN) return false; if (arphdr->ar_pln != 4) return false; hw_src = (u8 *)arphdr + sizeof(struct arphdr); hw_dst = hw_src + ETH_ALEN + 4; bla_dst = (struct batadv_bla_claim_dst *)hw_dst; bla_dst_own = &bat_priv->bla.claim_dest; /* check if it is a claim frame in general */ if (memcmp(bla_dst->magic, bla_dst_own->magic, sizeof(bla_dst->magic)) != 0) return false; /* check if there is a claim frame encapsulated deeper in (QinQ) and * drop that, as this is not supported by BLA but should also not be * sent via the mesh. */ if (vlan_depth > 1) return true; /* Let the loopdetect frames on the mesh in any case. */ if (bla_dst->type == BATADV_CLAIM_TYPE_LOOPDETECT) return false; /* check if it is a claim frame. */ ret = batadv_check_claim_group(bat_priv, primary_if, hw_src, hw_dst, ethhdr); if (ret == 1) batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): received a claim frame from another group. From: %pM on vid %d ...(hw_src %pM, hw_dst %pM)\n", __func__, ethhdr->h_source, batadv_print_vid(vid), hw_src, hw_dst); if (ret < 2) return !!ret; /* become a backbone gw ourselves on this vlan if not happened yet */ batadv_bla_update_own_backbone_gw(bat_priv, primary_if, vid); /* check for the different types of claim frames ... */ switch (bla_dst->type) { case BATADV_CLAIM_TYPE_CLAIM: if (batadv_handle_claim(bat_priv, primary_if, hw_src, ethhdr->h_source, vid)) return true; break; case BATADV_CLAIM_TYPE_UNCLAIM: if (batadv_handle_unclaim(bat_priv, primary_if, ethhdr->h_source, hw_src, vid)) return true; break; case BATADV_CLAIM_TYPE_ANNOUNCE: if (batadv_handle_announce(bat_priv, hw_src, ethhdr->h_source, vid)) return true; break; case BATADV_CLAIM_TYPE_REQUEST: if (batadv_handle_request(bat_priv, primary_if, hw_src, ethhdr, vid)) return true; break; } batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): ERROR - this looks like a claim frame, but is useless. eth src %pM on vid %d ...(hw_src %pM, hw_dst %pM)\n", __func__, ethhdr->h_source, batadv_print_vid(vid), hw_src, hw_dst); return true; } /** * batadv_bla_purge_backbone_gw() - Remove backbone gateways after a timeout or * immediately * @bat_priv: the bat priv with all the soft interface information * @now: whether the whole hash shall be wiped now * * Check when we last heard from other nodes, and remove them in case of * a time out, or clean all backbone gws if now is set. */ static void batadv_bla_purge_backbone_gw(struct batadv_priv *bat_priv, int now) { struct batadv_bla_backbone_gw *backbone_gw; struct hlist_node *node_tmp; struct hlist_head *head; struct batadv_hashtable *hash; spinlock_t *list_lock; /* protects write access to the hash lists */ int i; hash = bat_priv->bla.backbone_hash; if (!hash) return; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; list_lock = &hash->list_locks[i]; spin_lock_bh(list_lock); hlist_for_each_entry_safe(backbone_gw, node_tmp, head, hash_entry) { if (now) goto purge_now; if (!batadv_has_timed_out(backbone_gw->lasttime, BATADV_BLA_BACKBONE_TIMEOUT)) continue; batadv_dbg(BATADV_DBG_BLA, backbone_gw->bat_priv, "%s(): backbone gw %pM timed out\n", __func__, backbone_gw->orig); purge_now: /* don't wait for the pending request anymore */ if (atomic_read(&backbone_gw->request_sent)) atomic_dec(&bat_priv->bla.num_requests); batadv_bla_del_backbone_claims(backbone_gw); hlist_del_rcu(&backbone_gw->hash_entry); batadv_backbone_gw_put(backbone_gw); } spin_unlock_bh(list_lock); } } /** * batadv_bla_purge_claims() - Remove claims after a timeout or immediately * @bat_priv: the bat priv with all the soft interface information * @primary_if: the selected primary interface, may be NULL if now is set * @now: whether the whole hash shall be wiped now * * Check when we heard last time from our own claims, and remove them in case of * a time out, or clean all claims if now is set */ static void batadv_bla_purge_claims(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, int now) { struct batadv_bla_backbone_gw *backbone_gw; struct batadv_bla_claim *claim; struct hlist_head *head; struct batadv_hashtable *hash; int i; hash = bat_priv->bla.claim_hash; if (!hash) return; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; rcu_read_lock(); hlist_for_each_entry_rcu(claim, head, hash_entry) { backbone_gw = batadv_bla_claim_get_backbone_gw(claim); if (now) goto purge_now; if (!batadv_compare_eth(backbone_gw->orig, primary_if->net_dev->dev_addr)) goto skip; if (!batadv_has_timed_out(claim->lasttime, BATADV_BLA_CLAIM_TIMEOUT)) goto skip; batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): timed out.\n", __func__); purge_now: batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): %pM, vid %d\n", __func__, claim->addr, claim->vid); batadv_handle_unclaim(bat_priv, primary_if, backbone_gw->orig, claim->addr, claim->vid); skip: batadv_backbone_gw_put(backbone_gw); } rcu_read_unlock(); } } /** * batadv_bla_update_orig_address() - Update the backbone gateways when the own * originator address changes * @bat_priv: the bat priv with all the soft interface information * @primary_if: the new selected primary_if * @oldif: the old primary interface, may be NULL */ void batadv_bla_update_orig_address(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, struct batadv_hard_iface *oldif) { struct batadv_bla_backbone_gw *backbone_gw; struct hlist_head *head; struct batadv_hashtable *hash; __be16 group; int i; /* reset bridge loop avoidance group id */ group = htons(crc16(0, primary_if->net_dev->dev_addr, ETH_ALEN)); bat_priv->bla.claim_dest.group = group; /* purge everything when bridge loop avoidance is turned off */ if (!atomic_read(&bat_priv->bridge_loop_avoidance)) oldif = NULL; if (!oldif) { batadv_bla_purge_claims(bat_priv, NULL, 1); batadv_bla_purge_backbone_gw(bat_priv, 1); return; } hash = bat_priv->bla.backbone_hash; if (!hash) return; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; rcu_read_lock(); hlist_for_each_entry_rcu(backbone_gw, head, hash_entry) { /* own orig still holds the old value. */ if (!batadv_compare_eth(backbone_gw->orig, oldif->net_dev->dev_addr)) continue; ether_addr_copy(backbone_gw->orig, primary_if->net_dev->dev_addr); /* send an announce frame so others will ask for our * claims and update their tables. */ batadv_bla_send_announce(bat_priv, backbone_gw); } rcu_read_unlock(); } } /** * batadv_bla_send_loopdetect() - send a loopdetect frame * @bat_priv: the bat priv with all the soft interface information * @backbone_gw: the backbone gateway for which a loop should be detected * * To detect loops that the bridge loop avoidance can't handle, send a loop * detection packet on the backbone. Unlike other BLA frames, this frame will * be allowed on the mesh by other nodes. If it is received on the mesh, this * indicates that there is a loop. */ static void batadv_bla_send_loopdetect(struct batadv_priv *bat_priv, struct batadv_bla_backbone_gw *backbone_gw) { batadv_dbg(BATADV_DBG_BLA, bat_priv, "Send loopdetect frame for vid %d\n", backbone_gw->vid); batadv_bla_send_claim(bat_priv, bat_priv->bla.loopdetect_addr, backbone_gw->vid, BATADV_CLAIM_TYPE_LOOPDETECT); } /** * batadv_bla_status_update() - purge bla interfaces if necessary * @net_dev: the soft interface net device */ void batadv_bla_status_update(struct net_device *net_dev) { struct batadv_priv *bat_priv = netdev_priv(net_dev); struct batadv_hard_iface *primary_if; primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) return; /* this function already purges everything when bla is disabled, * so just call that one. */ batadv_bla_update_orig_address(bat_priv, primary_if, primary_if); batadv_hardif_put(primary_if); } /** * batadv_bla_periodic_work() - performs periodic bla work * @work: kernel work struct * * periodic work to do: * * purge structures when they are too old * * send announcements */ static void batadv_bla_periodic_work(struct work_struct *work) { struct delayed_work *delayed_work; struct batadv_priv *bat_priv; struct batadv_priv_bla *priv_bla; struct hlist_head *head; struct batadv_bla_backbone_gw *backbone_gw; struct batadv_hashtable *hash; struct batadv_hard_iface *primary_if; bool send_loopdetect = false; int i; delayed_work = to_delayed_work(work); priv_bla = container_of(delayed_work, struct batadv_priv_bla, work); bat_priv = container_of(priv_bla, struct batadv_priv, bla); primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) goto out; batadv_bla_purge_claims(bat_priv, primary_if, 0); batadv_bla_purge_backbone_gw(bat_priv, 0); if (!atomic_read(&bat_priv->bridge_loop_avoidance)) goto out; if (atomic_dec_and_test(&bat_priv->bla.loopdetect_next)) { /* set a new random mac address for the next bridge loop * detection frames. Set the locally administered bit to avoid * collisions with users mac addresses. */ eth_random_addr(bat_priv->bla.loopdetect_addr); bat_priv->bla.loopdetect_addr[0] = 0xba; bat_priv->bla.loopdetect_addr[1] = 0xbe; bat_priv->bla.loopdetect_lasttime = jiffies; atomic_set(&bat_priv->bla.loopdetect_next, BATADV_BLA_LOOPDETECT_PERIODS); /* mark for sending loop detect on all VLANs */ send_loopdetect = true; } hash = bat_priv->bla.backbone_hash; if (!hash) goto out; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; rcu_read_lock(); hlist_for_each_entry_rcu(backbone_gw, head, hash_entry) { if (!batadv_compare_eth(backbone_gw->orig, primary_if->net_dev->dev_addr)) continue; backbone_gw->lasttime = jiffies; batadv_bla_send_announce(bat_priv, backbone_gw); if (send_loopdetect) batadv_bla_send_loopdetect(bat_priv, backbone_gw); /* request_sent is only set after creation to avoid * problems when we are not yet known as backbone gw * in the backbone. * * We can reset this now after we waited some periods * to give bridge forward delays and bla group forming * some grace time. */ if (atomic_read(&backbone_gw->request_sent) == 0) continue; if (!atomic_dec_and_test(&backbone_gw->wait_periods)) continue; atomic_dec(&backbone_gw->bat_priv->bla.num_requests); atomic_set(&backbone_gw->request_sent, 0); } rcu_read_unlock(); } out: batadv_hardif_put(primary_if); queue_delayed_work(batadv_event_workqueue, &bat_priv->bla.work, msecs_to_jiffies(BATADV_BLA_PERIOD_LENGTH)); } /* The hash for claim and backbone hash receive the same key because they * are getting initialized by hash_new with the same key. Reinitializing * them with to different keys to allow nested locking without generating * lockdep warnings */ static struct lock_class_key batadv_claim_hash_lock_class_key; static struct lock_class_key batadv_backbone_hash_lock_class_key; /** * batadv_bla_init() - initialize all bla structures * @bat_priv: the bat priv with all the soft interface information * * Return: 0 on success, < 0 on error. */ int batadv_bla_init(struct batadv_priv *bat_priv) { int i; u8 claim_dest[ETH_ALEN] = {0xff, 0x43, 0x05, 0x00, 0x00, 0x00}; struct batadv_hard_iface *primary_if; u16 crc; unsigned long entrytime; spin_lock_init(&bat_priv->bla.bcast_duplist_lock); batadv_dbg(BATADV_DBG_BLA, bat_priv, "bla hash registering\n"); /* setting claim destination address */ memcpy(&bat_priv->bla.claim_dest.magic, claim_dest, 3); bat_priv->bla.claim_dest.type = 0; primary_if = batadv_primary_if_get_selected(bat_priv); if (primary_if) { crc = crc16(0, primary_if->net_dev->dev_addr, ETH_ALEN); bat_priv->bla.claim_dest.group = htons(crc); batadv_hardif_put(primary_if); } else { bat_priv->bla.claim_dest.group = 0; /* will be set later */ } /* initialize the duplicate list */ entrytime = jiffies - msecs_to_jiffies(BATADV_DUPLIST_TIMEOUT); for (i = 0; i < BATADV_DUPLIST_SIZE; i++) bat_priv->bla.bcast_duplist[i].entrytime = entrytime; bat_priv->bla.bcast_duplist_curr = 0; atomic_set(&bat_priv->bla.loopdetect_next, BATADV_BLA_LOOPDETECT_PERIODS); if (bat_priv->bla.claim_hash) return 0; bat_priv->bla.claim_hash = batadv_hash_new(128); if (!bat_priv->bla.claim_hash) return -ENOMEM; bat_priv->bla.backbone_hash = batadv_hash_new(32); if (!bat_priv->bla.backbone_hash) { batadv_hash_destroy(bat_priv->bla.claim_hash); return -ENOMEM; } batadv_hash_set_lock_class(bat_priv->bla.claim_hash, &batadv_claim_hash_lock_class_key); batadv_hash_set_lock_class(bat_priv->bla.backbone_hash, &batadv_backbone_hash_lock_class_key); batadv_dbg(BATADV_DBG_BLA, bat_priv, "bla hashes initialized\n"); INIT_DELAYED_WORK(&bat_priv->bla.work, batadv_bla_periodic_work); queue_delayed_work(batadv_event_workqueue, &bat_priv->bla.work, msecs_to_jiffies(BATADV_BLA_PERIOD_LENGTH)); return 0; } /** * batadv_bla_check_duplist() - Check if a frame is in the broadcast dup. * @bat_priv: the bat priv with all the soft interface information * @skb: contains the multicast packet to be checked * @payload_ptr: pointer to position inside the head buffer of the skb * marking the start of the data to be CRC'ed * @orig: originator mac address, NULL if unknown * * Check if it is on our broadcast list. Another gateway might have sent the * same packet because it is connected to the same backbone, so we have to * remove this duplicate. * * This is performed by checking the CRC, which will tell us * with a good chance that it is the same packet. If it is furthermore * sent by another host, drop it. We allow equal packets from * the same host however as this might be intended. * * Return: true if a packet is in the duplicate list, false otherwise. */ static bool batadv_bla_check_duplist(struct batadv_priv *bat_priv, struct sk_buff *skb, u8 *payload_ptr, const u8 *orig) { struct batadv_bcast_duplist_entry *entry; bool ret = false; int i, curr; __be32 crc; /* calculate the crc ... */ crc = batadv_skb_crc32(skb, payload_ptr); spin_lock_bh(&bat_priv->bla.bcast_duplist_lock); for (i = 0; i < BATADV_DUPLIST_SIZE; i++) { curr = (bat_priv->bla.bcast_duplist_curr + i); curr %= BATADV_DUPLIST_SIZE; entry = &bat_priv->bla.bcast_duplist[curr]; /* we can stop searching if the entry is too old ; * later entries will be even older */ if (batadv_has_timed_out(entry->entrytime, BATADV_DUPLIST_TIMEOUT)) break; if (entry->crc != crc) continue; /* are the originators both known and not anonymous? */ if (orig && !is_zero_ether_addr(orig) && !is_zero_ether_addr(entry->orig)) { /* If known, check if the new frame came from * the same originator: * We are safe to take identical frames from the * same orig, if known, as multiplications in * the mesh are detected via the (orig, seqno) pair. * So we can be a bit more liberal here and allow * identical frames from the same orig which the source * host might have sent multiple times on purpose. */ if (batadv_compare_eth(entry->orig, orig)) continue; } /* this entry seems to match: same crc, not too old, * and from another gw. therefore return true to forbid it. */ ret = true; goto out; } /* not found, add a new entry (overwrite the oldest entry) * and allow it, its the first occurrence. */ curr = (bat_priv->bla.bcast_duplist_curr + BATADV_DUPLIST_SIZE - 1); curr %= BATADV_DUPLIST_SIZE; entry = &bat_priv->bla.bcast_duplist[curr]; entry->crc = crc; entry->entrytime = jiffies; /* known originator */ if (orig) ether_addr_copy(entry->orig, orig); /* anonymous originator */ else eth_zero_addr(entry->orig); bat_priv->bla.bcast_duplist_curr = curr; out: spin_unlock_bh(&bat_priv->bla.bcast_duplist_lock); return ret; } /** * batadv_bla_check_ucast_duplist() - Check if a frame is in the broadcast dup. * @bat_priv: the bat priv with all the soft interface information * @skb: contains the multicast packet to be checked, decapsulated from a * unicast_packet * * Check if it is on our broadcast list. Another gateway might have sent the * same packet because it is connected to the same backbone, so we have to * remove this duplicate. * * Return: true if a packet is in the duplicate list, false otherwise. */ static bool batadv_bla_check_ucast_duplist(struct batadv_priv *bat_priv, struct sk_buff *skb) { return batadv_bla_check_duplist(bat_priv, skb, (u8 *)skb->data, NULL); } /** * batadv_bla_check_bcast_duplist() - Check if a frame is in the broadcast dup. * @bat_priv: the bat priv with all the soft interface information * @skb: contains the bcast_packet to be checked * * Check if it is on our broadcast list. Another gateway might have sent the * same packet because it is connected to the same backbone, so we have to * remove this duplicate. * * Return: true if a packet is in the duplicate list, false otherwise. */ bool batadv_bla_check_bcast_duplist(struct batadv_priv *bat_priv, struct sk_buff *skb) { struct batadv_bcast_packet *bcast_packet; u8 *payload_ptr; bcast_packet = (struct batadv_bcast_packet *)skb->data; payload_ptr = (u8 *)(bcast_packet + 1); return batadv_bla_check_duplist(bat_priv, skb, payload_ptr, bcast_packet->orig); } /** * batadv_bla_is_backbone_gw_orig() - Check if the originator is a gateway for * the VLAN identified by vid. * @bat_priv: the bat priv with all the soft interface information * @orig: originator mac address * @vid: VLAN identifier * * Return: true if orig is a backbone for this vid, false otherwise. */ bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, u8 *orig, unsigned short vid) { struct batadv_hashtable *hash = bat_priv->bla.backbone_hash; struct hlist_head *head; struct batadv_bla_backbone_gw *backbone_gw; int i; if (!atomic_read(&bat_priv->bridge_loop_avoidance)) return false; if (!hash) return false; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; rcu_read_lock(); hlist_for_each_entry_rcu(backbone_gw, head, hash_entry) { if (batadv_compare_eth(backbone_gw->orig, orig) && backbone_gw->vid == vid) { rcu_read_unlock(); return true; } } rcu_read_unlock(); } return false; } /** * batadv_bla_is_backbone_gw() - check if originator is a backbone gw for a VLAN * @skb: the frame to be checked * @orig_node: the orig_node of the frame * @hdr_size: maximum length of the frame * * Return: true if the orig_node is also a gateway on the soft interface, * otherwise it returns false. */ bool batadv_bla_is_backbone_gw(struct sk_buff *skb, struct batadv_orig_node *orig_node, int hdr_size) { struct batadv_bla_backbone_gw *backbone_gw; unsigned short vid; if (!atomic_read(&orig_node->bat_priv->bridge_loop_avoidance)) return false; /* first, find out the vid. */ if (!pskb_may_pull(skb, hdr_size + ETH_HLEN)) return false; vid = batadv_get_vid(skb, hdr_size); /* see if this originator is a backbone gw for this VLAN */ backbone_gw = batadv_backbone_hash_find(orig_node->bat_priv, orig_node->orig, vid); if (!backbone_gw) return false; batadv_backbone_gw_put(backbone_gw); return true; } /** * batadv_bla_free() - free all bla structures * @bat_priv: the bat priv with all the soft interface information * * for softinterface free or module unload */ void batadv_bla_free(struct batadv_priv *bat_priv) { struct batadv_hard_iface *primary_if; cancel_delayed_work_sync(&bat_priv->bla.work); primary_if = batadv_primary_if_get_selected(bat_priv); if (bat_priv->bla.claim_hash) { batadv_bla_purge_claims(bat_priv, primary_if, 1); batadv_hash_destroy(bat_priv->bla.claim_hash); bat_priv->bla.claim_hash = NULL; } if (bat_priv->bla.backbone_hash) { batadv_bla_purge_backbone_gw(bat_priv, 1); batadv_hash_destroy(bat_priv->bla.backbone_hash); bat_priv->bla.backbone_hash = NULL; } batadv_hardif_put(primary_if); } /** * batadv_bla_loopdetect_check() - check and handle a detected loop * @bat_priv: the bat priv with all the soft interface information * @skb: the packet to check * @primary_if: interface where the request came on * @vid: the VLAN ID of the frame * * Checks if this packet is a loop detect frame which has been sent by us, * throws an uevent and logs the event if that is the case. * * Return: true if it is a loop detect frame which is to be dropped, false * otherwise. */ static bool batadv_bla_loopdetect_check(struct batadv_priv *bat_priv, struct sk_buff *skb, struct batadv_hard_iface *primary_if, unsigned short vid) { struct batadv_bla_backbone_gw *backbone_gw; struct ethhdr *ethhdr; bool ret; ethhdr = eth_hdr(skb); /* Only check for the MAC address and skip more checks here for * performance reasons - this function is on the hotpath, after all. */ if (!batadv_compare_eth(ethhdr->h_source, bat_priv->bla.loopdetect_addr)) return false; /* If the packet came too late, don't forward it on the mesh * but don't consider that as loop. It might be a coincidence. */ if (batadv_has_timed_out(bat_priv->bla.loopdetect_lasttime, BATADV_BLA_LOOPDETECT_TIMEOUT)) return true; backbone_gw = batadv_bla_get_backbone_gw(bat_priv, primary_if->net_dev->dev_addr, vid, true); if (unlikely(!backbone_gw)) return true; ret = queue_work(batadv_event_workqueue, &backbone_gw->report_work); /* backbone_gw is unreferenced in the report work function * if queue_work() call was successful */ if (!ret) batadv_backbone_gw_put(backbone_gw); return true; } /** * batadv_bla_rx() - check packets coming from the mesh. * @bat_priv: the bat priv with all the soft interface information * @skb: the frame to be checked * @vid: the VLAN ID of the frame * @packet_type: the batman packet type this frame came in * * batadv_bla_rx avoidance checks if: * * we have to race for a claim * * if the frame is allowed on the LAN * * In these cases, the skb is further handled by this function * * Return: true if handled, otherwise it returns false and the caller shall * further process the skb. */ bool batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid, int packet_type) { struct batadv_bla_backbone_gw *backbone_gw; struct ethhdr *ethhdr; struct batadv_bla_claim search_claim, *claim = NULL; struct batadv_hard_iface *primary_if; bool own_claim; bool ret; ethhdr = eth_hdr(skb); primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) goto handled; if (!atomic_read(&bat_priv->bridge_loop_avoidance)) goto allow; if (batadv_bla_loopdetect_check(bat_priv, skb, primary_if, vid)) goto handled; if (unlikely(atomic_read(&bat_priv->bla.num_requests))) /* don't allow multicast packets while requests are in flight */ if (is_multicast_ether_addr(ethhdr->h_dest)) /* Both broadcast flooding or multicast-via-unicasts * delivery might send to multiple backbone gateways * sharing the same LAN and therefore need to coordinate * which backbone gateway forwards into the LAN, * by claiming the payload source address. * * Broadcast flooding and multicast-via-unicasts * delivery use the following two batman packet types. * Note: explicitly exclude BATADV_UNICAST_4ADDR, * as the DHCP gateway feature will send explicitly * to only one BLA gateway, so the claiming process * should be avoided there. */ if (packet_type == BATADV_BCAST || packet_type == BATADV_UNICAST) goto handled; /* potential duplicates from foreign BLA backbone gateways via * multicast-in-unicast packets */ if (is_multicast_ether_addr(ethhdr->h_dest) && packet_type == BATADV_UNICAST && batadv_bla_check_ucast_duplist(bat_priv, skb)) goto handled; ether_addr_copy(search_claim.addr, ethhdr->h_source); search_claim.vid = vid; claim = batadv_claim_hash_find(bat_priv, &search_claim); if (!claim) { /* possible optimization: race for a claim */ /* No claim exists yet, claim it for us! */ batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): Unclaimed MAC %pM found. Claim it. Local: %s\n", __func__, ethhdr->h_source, batadv_is_my_client(bat_priv, ethhdr->h_source, vid) ? "yes" : "no"); batadv_handle_claim(bat_priv, primary_if, primary_if->net_dev->dev_addr, ethhdr->h_source, vid); goto allow; } /* if it is our own claim ... */ backbone_gw = batadv_bla_claim_get_backbone_gw(claim); own_claim = batadv_compare_eth(backbone_gw->orig, primary_if->net_dev->dev_addr); batadv_backbone_gw_put(backbone_gw); if (own_claim) { /* ... allow it in any case */ claim->lasttime = jiffies; goto allow; } /* if it is a multicast ... */ if (is_multicast_ether_addr(ethhdr->h_dest) && (packet_type == BATADV_BCAST || packet_type == BATADV_UNICAST)) { /* ... drop it. the responsible gateway is in charge. * * We need to check packet type because with the gateway * feature, broadcasts (like DHCP requests) may be sent * using a unicast 4 address packet type. See comment above. */ goto handled; } else { /* seems the client considers us as its best gateway. * send a claim and update the claim table * immediately. */ batadv_handle_claim(bat_priv, primary_if, primary_if->net_dev->dev_addr, ethhdr->h_source, vid); goto allow; } allow: batadv_bla_update_own_backbone_gw(bat_priv, primary_if, vid); ret = false; goto out; handled: kfree_skb(skb); ret = true; out: batadv_hardif_put(primary_if); batadv_claim_put(claim); return ret; } /** * batadv_bla_tx() - check packets going into the mesh * @bat_priv: the bat priv with all the soft interface information * @skb: the frame to be checked * @vid: the VLAN ID of the frame * * batadv_bla_tx checks if: * * a claim was received which has to be processed * * the frame is allowed on the mesh * * in these cases, the skb is further handled by this function. * * This call might reallocate skb data. * * Return: true if handled, otherwise it returns false and the caller shall * further process the skb. */ bool batadv_bla_tx(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid) { struct ethhdr *ethhdr; struct batadv_bla_claim search_claim, *claim = NULL; struct batadv_bla_backbone_gw *backbone_gw; struct batadv_hard_iface *primary_if; bool client_roamed; bool ret = false; primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) goto out; if (!atomic_read(&bat_priv->bridge_loop_avoidance)) goto allow; if (batadv_bla_process_claim(bat_priv, primary_if, skb)) goto handled; ethhdr = eth_hdr(skb); if (unlikely(atomic_read(&bat_priv->bla.num_requests))) /* don't allow broadcasts while requests are in flight */ if (is_multicast_ether_addr(ethhdr->h_dest)) goto handled; ether_addr_copy(search_claim.addr, ethhdr->h_source); search_claim.vid = vid; claim = batadv_claim_hash_find(bat_priv, &search_claim); /* if no claim exists, allow it. */ if (!claim) goto allow; /* check if we are responsible. */ backbone_gw = batadv_bla_claim_get_backbone_gw(claim); client_roamed = batadv_compare_eth(backbone_gw->orig, primary_if->net_dev->dev_addr); batadv_backbone_gw_put(backbone_gw); if (client_roamed) { /* if yes, the client has roamed and we have * to unclaim it. */ if (batadv_has_timed_out(claim->lasttime, 100)) { /* only unclaim if the last claim entry is * older than 100 ms to make sure we really * have a roaming client here. */ batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): Roaming client %pM detected. Unclaim it.\n", __func__, ethhdr->h_source); batadv_handle_unclaim(bat_priv, primary_if, primary_if->net_dev->dev_addr, ethhdr->h_source, vid); goto allow; } else { batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): Race for claim %pM detected. Drop packet.\n", __func__, ethhdr->h_source); goto handled; } } /* check if it is a multicast/broadcast frame */ if (is_multicast_ether_addr(ethhdr->h_dest)) { /* drop it. the responsible gateway has forwarded it into * the backbone network. */ goto handled; } else { /* we must allow it. at least if we are * responsible for the DESTINATION. */ goto allow; } allow: batadv_bla_update_own_backbone_gw(bat_priv, primary_if, vid); ret = false; goto out; handled: ret = true; out: batadv_hardif_put(primary_if); batadv_claim_put(claim); return ret; } /** * batadv_bla_claim_dump_entry() - dump one entry of the claim table * to a netlink socket * @msg: buffer for the message * @portid: netlink port * @cb: Control block containing additional options * @primary_if: primary interface * @claim: entry to dump * * Return: 0 or error code. */ static int batadv_bla_claim_dump_entry(struct sk_buff *msg, u32 portid, struct netlink_callback *cb, struct batadv_hard_iface *primary_if, struct batadv_bla_claim *claim) { const u8 *primary_addr = primary_if->net_dev->dev_addr; u16 backbone_crc; bool is_own; void *hdr; int ret = -EINVAL; hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq, &batadv_netlink_family, NLM_F_MULTI, BATADV_CMD_GET_BLA_CLAIM); if (!hdr) { ret = -ENOBUFS; goto out; } genl_dump_check_consistent(cb, hdr); is_own = batadv_compare_eth(claim->backbone_gw->orig, primary_addr); spin_lock_bh(&claim->backbone_gw->crc_lock); backbone_crc = claim->backbone_gw->crc; spin_unlock_bh(&claim->backbone_gw->crc_lock); if (is_own) if (nla_put_flag(msg, BATADV_ATTR_BLA_OWN)) { genlmsg_cancel(msg, hdr); goto out; } if (nla_put(msg, BATADV_ATTR_BLA_ADDRESS, ETH_ALEN, claim->addr) || nla_put_u16(msg, BATADV_ATTR_BLA_VID, claim->vid) || nla_put(msg, BATADV_ATTR_BLA_BACKBONE, ETH_ALEN, claim->backbone_gw->orig) || nla_put_u16(msg, BATADV_ATTR_BLA_CRC, backbone_crc)) { genlmsg_cancel(msg, hdr); goto out; } genlmsg_end(msg, hdr); ret = 0; out: return ret; } /** * batadv_bla_claim_dump_bucket() - dump one bucket of the claim table * to a netlink socket * @msg: buffer for the message * @portid: netlink port * @cb: Control block containing additional options * @primary_if: primary interface * @hash: hash to dump * @bucket: bucket index to dump * @idx_skip: How many entries to skip * * Return: always 0. */ static int batadv_bla_claim_dump_bucket(struct sk_buff *msg, u32 portid, struct netlink_callback *cb, struct batadv_hard_iface *primary_if, struct batadv_hashtable *hash, unsigned int bucket, int *idx_skip) { struct batadv_bla_claim *claim; int idx = 0; int ret = 0; spin_lock_bh(&hash->list_locks[bucket]); cb->seq = atomic_read(&hash->generation) << 1 | 1; hlist_for_each_entry(claim, &hash->table[bucket], hash_entry) { if (idx++ < *idx_skip) continue; ret = batadv_bla_claim_dump_entry(msg, portid, cb, primary_if, claim); if (ret) { *idx_skip = idx - 1; goto unlock; } } *idx_skip = 0; unlock: spin_unlock_bh(&hash->list_locks[bucket]); return ret; } /** * batadv_bla_claim_dump() - dump claim table to a netlink socket * @msg: buffer for the message * @cb: callback structure containing arguments * * Return: message length. */ int batadv_bla_claim_dump(struct sk_buff *msg, struct netlink_callback *cb) { struct batadv_hard_iface *primary_if = NULL; int portid = NETLINK_CB(cb->skb).portid; struct net *net = sock_net(cb->skb->sk); struct net_device *soft_iface; struct batadv_hashtable *hash; struct batadv_priv *bat_priv; int bucket = cb->args[0]; int idx = cb->args[1]; int ifindex; int ret = 0; ifindex = batadv_netlink_get_ifindex(cb->nlh, BATADV_ATTR_MESH_IFINDEX); if (!ifindex) return -EINVAL; soft_iface = dev_get_by_index(net, ifindex); if (!soft_iface || !batadv_softif_is_valid(soft_iface)) { ret = -ENODEV; goto out; } bat_priv = netdev_priv(soft_iface); hash = bat_priv->bla.claim_hash; primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) { ret = -ENOENT; goto out; } while (bucket < hash->size) { if (batadv_bla_claim_dump_bucket(msg, portid, cb, primary_if, hash, bucket, &idx)) break; bucket++; } cb->args[0] = bucket; cb->args[1] = idx; ret = msg->len; out: batadv_hardif_put(primary_if); dev_put(soft_iface); return ret; } /** * batadv_bla_backbone_dump_entry() - dump one entry of the backbone table to a * netlink socket * @msg: buffer for the message * @portid: netlink port * @cb: Control block containing additional options * @primary_if: primary interface * @backbone_gw: entry to dump * * Return: 0 or error code. */ static int batadv_bla_backbone_dump_entry(struct sk_buff *msg, u32 portid, struct netlink_callback *cb, struct batadv_hard_iface *primary_if, struct batadv_bla_backbone_gw *backbone_gw) { const u8 *primary_addr = primary_if->net_dev->dev_addr; u16 backbone_crc; bool is_own; int msecs; void *hdr; int ret = -EINVAL; hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq, &batadv_netlink_family, NLM_F_MULTI, BATADV_CMD_GET_BLA_BACKBONE); if (!hdr) { ret = -ENOBUFS; goto out; } genl_dump_check_consistent(cb, hdr); is_own = batadv_compare_eth(backbone_gw->orig, primary_addr); spin_lock_bh(&backbone_gw->crc_lock); backbone_crc = backbone_gw->crc; spin_unlock_bh(&backbone_gw->crc_lock); msecs = jiffies_to_msecs(jiffies - backbone_gw->lasttime); if (is_own) if (nla_put_flag(msg, BATADV_ATTR_BLA_OWN)) { genlmsg_cancel(msg, hdr); goto out; } if (nla_put(msg, BATADV_ATTR_BLA_BACKBONE, ETH_ALEN, backbone_gw->orig) || nla_put_u16(msg, BATADV_ATTR_BLA_VID, backbone_gw->vid) || nla_put_u16(msg, BATADV_ATTR_BLA_CRC, backbone_crc) || nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS, msecs)) { genlmsg_cancel(msg, hdr); goto out; } genlmsg_end(msg, hdr); ret = 0; out: return ret; } /** * batadv_bla_backbone_dump_bucket() - dump one bucket of the backbone table to * a netlink socket * @msg: buffer for the message * @portid: netlink port * @cb: Control block containing additional options * @primary_if: primary interface * @hash: hash to dump * @bucket: bucket index to dump * @idx_skip: How many entries to skip * * Return: always 0. */ static int batadv_bla_backbone_dump_bucket(struct sk_buff *msg, u32 portid, struct netlink_callback *cb, struct batadv_hard_iface *primary_if, struct batadv_hashtable *hash, unsigned int bucket, int *idx_skip) { struct batadv_bla_backbone_gw *backbone_gw; int idx = 0; int ret = 0; spin_lock_bh(&hash->list_locks[bucket]); cb->seq = atomic_read(&hash->generation) << 1 | 1; hlist_for_each_entry(backbone_gw, &hash->table[bucket], hash_entry) { if (idx++ < *idx_skip) continue; ret = batadv_bla_backbone_dump_entry(msg, portid, cb, primary_if, backbone_gw); if (ret) { *idx_skip = idx - 1; goto unlock; } } *idx_skip = 0; unlock: spin_unlock_bh(&hash->list_locks[bucket]); return ret; } /** * batadv_bla_backbone_dump() - dump backbone table to a netlink socket * @msg: buffer for the message * @cb: callback structure containing arguments * * Return: message length. */ int batadv_bla_backbone_dump(struct sk_buff *msg, struct netlink_callback *cb) { struct batadv_hard_iface *primary_if = NULL; int portid = NETLINK_CB(cb->skb).portid; struct net *net = sock_net(cb->skb->sk); struct net_device *soft_iface; struct batadv_hashtable *hash; struct batadv_priv *bat_priv; int bucket = cb->args[0]; int idx = cb->args[1]; int ifindex; int ret = 0; ifindex = batadv_netlink_get_ifindex(cb->nlh, BATADV_ATTR_MESH_IFINDEX); if (!ifindex) return -EINVAL; soft_iface = dev_get_by_index(net, ifindex); if (!soft_iface || !batadv_softif_is_valid(soft_iface)) { ret = -ENODEV; goto out; } bat_priv = netdev_priv(soft_iface); hash = bat_priv->bla.backbone_hash; primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) { ret = -ENOENT; goto out; } while (bucket < hash->size) { if (batadv_bla_backbone_dump_bucket(msg, portid, cb, primary_if, hash, bucket, &idx)) break; bucket++; } cb->args[0] = bucket; cb->args[1] = idx; ret = msg->len; out: batadv_hardif_put(primary_if); dev_put(soft_iface); return ret; } #ifdef CONFIG_BATMAN_ADV_DAT /** * batadv_bla_check_claim() - check if address is claimed * * @bat_priv: the bat priv with all the soft interface information * @addr: mac address of which the claim status is checked * @vid: the VLAN ID * * addr is checked if this address is claimed by the local device itself. * * Return: true if bla is disabled or the mac is claimed by the device, * false if the device addr is already claimed by another gateway */ bool batadv_bla_check_claim(struct batadv_priv *bat_priv, u8 *addr, unsigned short vid) { struct batadv_bla_claim search_claim; struct batadv_bla_claim *claim = NULL; struct batadv_hard_iface *primary_if = NULL; bool ret = true; if (!atomic_read(&bat_priv->bridge_loop_avoidance)) return ret; primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) return ret; /* First look if the mac address is claimed */ ether_addr_copy(search_claim.addr, addr); search_claim.vid = vid; claim = batadv_claim_hash_find(bat_priv, &search_claim); /* If there is a claim and we are not owner of the claim, * return false. */ if (claim) { if (!batadv_compare_eth(claim->backbone_gw->orig, primary_if->net_dev->dev_addr)) ret = false; batadv_claim_put(claim); } batadv_hardif_put(primary_if); return ret; } #endif
72 72 2 2 2 2 2 2 117 117 117 2 1 37 37 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 // SPDX-License-Identifier: GPL-2.0-only /* * Kernel-based Virtual Machine driver for Linux * * This module enables kernel and guest-mode vCPU access to guest physical * memory with suitable invalidation mechanisms. * * Copyright © 2021 Amazon.com, Inc. or its affiliates. * * Authors: * David Woodhouse <dwmw2@infradead.org> */ #include <linux/kvm_host.h> #include <linux/kvm.h> #include <linux/highmem.h> #include <linux/module.h> #include <linux/errno.h> #include "kvm_mm.h" /* * MMU notifier 'invalidate_range_start' hook. */ void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start, unsigned long end, bool may_block) { DECLARE_BITMAP(vcpu_bitmap, KVM_MAX_VCPUS); struct gfn_to_pfn_cache *gpc; bool evict_vcpus = false; spin_lock(&kvm->gpc_lock); list_for_each_entry(gpc, &kvm->gpc_list, list) { write_lock_irq(&gpc->lock); /* Only a single page so no need to care about length */ if (gpc->valid && !is_error_noslot_pfn(gpc->pfn) && gpc->uhva >= start && gpc->uhva < end) { gpc->valid = false; /* * If a guest vCPU could be using the physical address, * it needs to be forced out of guest mode. */ if (gpc->usage & KVM_GUEST_USES_PFN) { if (!evict_vcpus) { evict_vcpus = true; bitmap_zero(vcpu_bitmap, KVM_MAX_VCPUS); } __set_bit(gpc->vcpu->vcpu_idx, vcpu_bitmap); } } write_unlock_irq(&gpc->lock); } spin_unlock(&kvm->gpc_lock); if (evict_vcpus) { /* * KVM needs to ensure the vCPU is fully out of guest context * before allowing the invalidation to continue. */ unsigned int req = KVM_REQ_OUTSIDE_GUEST_MODE; bool called; /* * If the OOM reaper is active, then all vCPUs should have * been stopped already, so perform the request without * KVM_REQUEST_WAIT and be sad if any needed to be IPI'd. */ if (!may_block) req &= ~KVM_REQUEST_WAIT; called = kvm_make_vcpus_request_mask(kvm, req, vcpu_bitmap); WARN_ON_ONCE(called && !may_block); } } bool kvm_gpc_check(struct gfn_to_pfn_cache *gpc, unsigned long len) { struct kvm_memslots *slots = kvm_memslots(gpc->kvm); if (!gpc->active) return false; if ((gpc->gpa & ~PAGE_MASK) + len > PAGE_SIZE) return false; if (gpc->generation != slots->generation || kvm_is_error_hva(gpc->uhva)) return false; if (!gpc->valid) return false; return true; } EXPORT_SYMBOL_GPL(kvm_gpc_check); static void gpc_unmap_khva(kvm_pfn_t pfn, void *khva) { /* Unmap the old pfn/page if it was mapped before. */ if (!is_error_noslot_pfn(pfn) && khva) { if (pfn_valid(pfn)) kunmap(pfn_to_page(pfn)); #ifdef CONFIG_HAS_IOMEM else memunmap(khva); #endif } } static inline bool mmu_notifier_retry_cache(struct kvm *kvm, unsigned long mmu_seq) { /* * mn_active_invalidate_count acts for all intents and purposes * like mmu_invalidate_in_progress here; but the latter cannot * be used here because the invalidation of caches in the * mmu_notifier event occurs _before_ mmu_invalidate_in_progress * is elevated. * * Note, it does not matter that mn_active_invalidate_count * is not protected by gpc->lock. It is guaranteed to * be elevated before the mmu_notifier acquires gpc->lock, and * isn't dropped until after mmu_invalidate_seq is updated. */ if (kvm->mn_active_invalidate_count) return true; /* * Ensure mn_active_invalidate_count is read before * mmu_invalidate_seq. This pairs with the smp_wmb() in * mmu_notifier_invalidate_range_end() to guarantee either the * old (non-zero) value of mn_active_invalidate_count or the * new (incremented) value of mmu_invalidate_seq is observed. */ smp_rmb(); return kvm->mmu_invalidate_seq != mmu_seq; } static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) { /* Note, the new page offset may be different than the old! */ void *old_khva = gpc->khva - offset_in_page(gpc->khva); kvm_pfn_t new_pfn = KVM_PFN_ERR_FAULT; void *new_khva = NULL; unsigned long mmu_seq; lockdep_assert_held(&gpc->refresh_lock); lockdep_assert_held_write(&gpc->lock); /* * Invalidate the cache prior to dropping gpc->lock, the gpa=>uhva * assets have already been updated and so a concurrent check() from a * different task may not fail the gpa/uhva/generation checks. */ gpc->valid = false; do { mmu_seq = gpc->kvm->mmu_invalidate_seq; smp_rmb(); write_unlock_irq(&gpc->lock); /* * If the previous iteration "failed" due to an mmu_notifier * event, release the pfn and unmap the kernel virtual address * from the previous attempt. Unmapping might sleep, so this * needs to be done after dropping the lock. Opportunistically * check for resched while the lock isn't held. */ if (new_pfn != KVM_PFN_ERR_FAULT) { /* * Keep the mapping if the previous iteration reused * the existing mapping and didn't create a new one. */ if (new_khva != old_khva) gpc_unmap_khva(new_pfn, new_khva); kvm_release_pfn_clean(new_pfn); cond_resched(); } /* We always request a writeable mapping */ new_pfn = hva_to_pfn(gpc->uhva, false, false, NULL, true, NULL); if (is_error_noslot_pfn(new_pfn)) goto out_error; /* * Obtain a new kernel mapping if KVM itself will access the * pfn. Note, kmap() and memremap() can both sleep, so this * too must be done outside of gpc->lock! */ if (gpc->usage & KVM_HOST_USES_PFN) { if (new_pfn == gpc->pfn) { new_khva = old_khva; } else if (pfn_valid(new_pfn)) { new_khva = kmap(pfn_to_page(new_pfn)); #ifdef CONFIG_HAS_IOMEM } else { new_khva = memremap(pfn_to_hpa(new_pfn), PAGE_SIZE, MEMREMAP_WB); #endif } if (!new_khva) { kvm_release_pfn_clean(new_pfn); goto out_error; } } write_lock_irq(&gpc->lock); /* * Other tasks must wait for _this_ refresh to complete before * attempting to refresh. */ WARN_ON_ONCE(gpc->valid); } while (mmu_notifier_retry_cache(gpc->kvm, mmu_seq)); gpc->valid = true; gpc->pfn = new_pfn; gpc->khva = new_khva + (gpc->gpa & ~PAGE_MASK); /* * Put the reference to the _new_ pfn. The pfn is now tracked by the * cache and can be safely migrated, swapped, etc... as the cache will * invalidate any mappings in response to relevant mmu_notifier events. */ kvm_release_pfn_clean(new_pfn); return 0; out_error: write_lock_irq(&gpc->lock); return -EFAULT; } static int __kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len) { struct kvm_memslots *slots = kvm_memslots(gpc->kvm); unsigned long page_offset = gpa & ~PAGE_MASK; bool unmap_old = false; unsigned long old_uhva; kvm_pfn_t old_pfn; void *old_khva; int ret; /* * If must fit within a single page. The 'len' argument is * only to enforce that. */ if (page_offset + len > PAGE_SIZE) return -EINVAL; /* * If another task is refreshing the cache, wait for it to complete. * There is no guarantee that concurrent refreshes will see the same * gpa, memslots generation, etc..., so they must be fully serialized. */ mutex_lock(&gpc->refresh_lock); write_lock_irq(&gpc->lock); if (!gpc->active) { ret = -EINVAL; goto out_unlock; } old_pfn = gpc->pfn; old_khva = gpc->khva - offset_in_page(gpc->khva); old_uhva = gpc->uhva; /* If the userspace HVA is invalid, refresh that first */ if (gpc->gpa != gpa || gpc->generation != slots->generation || kvm_is_error_hva(gpc->uhva)) { gfn_t gfn = gpa_to_gfn(gpa); gpc->gpa = gpa; gpc->generation = slots->generation; gpc->memslot = __gfn_to_memslot(slots, gfn); gpc->uhva = gfn_to_hva_memslot(gpc->memslot, gfn); if (kvm_is_error_hva(gpc->uhva)) { ret = -EFAULT; goto out; } } /* * If the userspace HVA changed or the PFN was already invalid, * drop the lock and do the HVA to PFN lookup again. */ if (!gpc->valid || old_uhva != gpc->uhva) { ret = hva_to_pfn_retry(gpc); } else { /* * If the HVA→PFN mapping was already valid, don't unmap it. * But do update gpc->khva because the offset within the page * may have changed. */ gpc->khva = old_khva + page_offset; ret = 0; goto out_unlock; } out: /* * Invalidate the cache and purge the pfn/khva if the refresh failed. * Some/all of the uhva, gpa, and memslot generation info may still be * valid, leave it as is. */ if (ret) { gpc->valid = false; gpc->pfn = KVM_PFN_ERR_FAULT; gpc->khva = NULL; } /* Detect a pfn change before dropping the lock! */ unmap_old = (old_pfn != gpc->pfn); out_unlock: write_unlock_irq(&gpc->lock); mutex_unlock(&gpc->refresh_lock); if (unmap_old) gpc_unmap_khva(old_pfn, old_khva); return ret; } int kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, unsigned long len) { return __kvm_gpc_refresh(gpc, gpc->gpa, len); } EXPORT_SYMBOL_GPL(kvm_gpc_refresh); void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm, struct kvm_vcpu *vcpu, enum pfn_cache_usage usage) { WARN_ON_ONCE(!usage || (usage & KVM_GUEST_AND_HOST_USE_PFN) != usage); WARN_ON_ONCE((usage & KVM_GUEST_USES_PFN) && !vcpu); rwlock_init(&gpc->lock); mutex_init(&gpc->refresh_lock); gpc->kvm = kvm; gpc->vcpu = vcpu; gpc->usage = usage; gpc->pfn = KVM_PFN_ERR_FAULT; gpc->uhva = KVM_HVA_ERR_BAD; } EXPORT_SYMBOL_GPL(kvm_gpc_init); int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len) { struct kvm *kvm = gpc->kvm; if (!gpc->active) { if (KVM_BUG_ON(gpc->valid, kvm)) return -EIO; spin_lock(&kvm->gpc_lock); list_add(&gpc->list, &kvm->gpc_list); spin_unlock(&kvm->gpc_lock); /* * Activate the cache after adding it to the list, a concurrent * refresh must not establish a mapping until the cache is * reachable by mmu_notifier events. */ write_lock_irq(&gpc->lock); gpc->active = true; write_unlock_irq(&gpc->lock); } return __kvm_gpc_refresh(gpc, gpa, len); } EXPORT_SYMBOL_GPL(kvm_gpc_activate); void kvm_gpc_deactivate(struct gfn_to_pfn_cache *gpc) { struct kvm *kvm = gpc->kvm; kvm_pfn_t old_pfn; void *old_khva; if (gpc->active) { /* * Deactivate the cache before removing it from the list, KVM * must stall mmu_notifier events until all users go away, i.e. * until gpc->lock is dropped and refresh is guaranteed to fail. */ write_lock_irq(&gpc->lock); gpc->active = false; gpc->valid = false; /* * Leave the GPA => uHVA cache intact, it's protected by the * memslot generation. The PFN lookup needs to be redone every * time as mmu_notifier protection is lost when the cache is * removed from the VM's gpc_list. */ old_khva = gpc->khva - offset_in_page(gpc->khva); gpc->khva = NULL; old_pfn = gpc->pfn; gpc->pfn = KVM_PFN_ERR_FAULT; write_unlock_irq(&gpc->lock); spin_lock(&kvm->gpc_lock); list_del(&gpc->list); spin_unlock(&kvm->gpc_lock); gpc_unmap_khva(old_pfn, old_khva); } } EXPORT_SYMBOL_GPL(kvm_gpc_deactivate);
8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 // SPDX-License-Identifier: GPL-2.0-only #include <linux/clockchips.h> #include <linux/interrupt.h> #include <linux/export.h> #include <linux/delay.h> #include <linux/hpet.h> #include <linux/cpu.h> #include <linux/irq.h> #include <asm/irq_remapping.h> #include <asm/hpet.h> #include <asm/time.h> #include <asm/mwait.h> #undef pr_fmt #define pr_fmt(fmt) "hpet: " fmt enum hpet_mode { HPET_MODE_UNUSED, HPET_MODE_LEGACY, HPET_MODE_CLOCKEVT, HPET_MODE_DEVICE, }; struct hpet_channel { struct clock_event_device evt; unsigned int num; unsigned int cpu; unsigned int irq; unsigned int in_use; enum hpet_mode mode; unsigned int boot_cfg; char name[10]; }; struct hpet_base { unsigned int nr_channels; unsigned int nr_clockevents; unsigned int boot_cfg; struct hpet_channel *channels; }; #define HPET_MASK CLOCKSOURCE_MASK(32) #define HPET_MIN_CYCLES 128 #define HPET_MIN_PROG_DELTA (HPET_MIN_CYCLES + (HPET_MIN_CYCLES >> 1)) /* * HPET address is set in acpi/boot.c, when an ACPI entry exists */ unsigned long hpet_address; u8 hpet_blockid; /* OS timer block num */ bool hpet_msi_disable; #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_GENERIC_MSI_IRQ) static DEFINE_PER_CPU(struct hpet_channel *, cpu_hpet_channel); static struct irq_domain *hpet_domain; #endif static void __iomem *hpet_virt_address; static struct hpet_base hpet_base; static bool hpet_legacy_int_enabled; static unsigned long hpet_freq; bool boot_hpet_disable; bool hpet_force_user; static bool hpet_verbose; static inline struct hpet_channel *clockevent_to_channel(struct clock_event_device *evt) { return container_of(evt, struct hpet_channel, evt); } inline unsigned int hpet_readl(unsigned int a) { return readl(hpet_virt_address + a); } static inline void hpet_writel(unsigned int d, unsigned int a) { writel(d, hpet_virt_address + a); } static inline void hpet_set_mapping(void) { hpet_virt_address = ioremap(hpet_address, HPET_MMAP_SIZE); } static inline void hpet_clear_mapping(void) { iounmap(hpet_virt_address); hpet_virt_address = NULL; } /* * HPET command line enable / disable */ static int __init hpet_setup(char *str) { while (str) { char *next = strchr(str, ','); if (next) *next++ = 0; if (!strncmp("disable", str, 7)) boot_hpet_disable = true; if (!strncmp("force", str, 5)) hpet_force_user = true; if (!strncmp("verbose", str, 7)) hpet_verbose = true; str = next; } return 1; } __setup("hpet=", hpet_setup); static int __init disable_hpet(char *str) { boot_hpet_disable = true; return 1; } __setup("nohpet", disable_hpet); static inline int is_hpet_capable(void) { return !boot_hpet_disable && hpet_address; } /** * is_hpet_enabled - Check whether the legacy HPET timer interrupt is enabled */ int is_hpet_enabled(void) { return is_hpet_capable() && hpet_legacy_int_enabled; } EXPORT_SYMBOL_GPL(is_hpet_enabled); static void _hpet_print_config(const char *function, int line) { u32 i, id, period, cfg, status, channels, l, h; pr_info("%s(%d):\n", function, line); id = hpet_readl(HPET_ID); period = hpet_readl(HPET_PERIOD); pr_info("ID: 0x%x, PERIOD: 0x%x\n", id, period); cfg = hpet_readl(HPET_CFG); status = hpet_readl(HPET_STATUS); pr_info("CFG: 0x%x, STATUS: 0x%x\n", cfg, status); l = hpet_readl(HPET_COUNTER); h = hpet_readl(HPET_COUNTER+4); pr_info("COUNTER_l: 0x%x, COUNTER_h: 0x%x\n", l, h); channels = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1; for (i = 0; i < channels; i++) { l = hpet_readl(HPET_Tn_CFG(i)); h = hpet_readl(HPET_Tn_CFG(i)+4); pr_info("T%d: CFG_l: 0x%x, CFG_h: 0x%x\n", i, l, h); l = hpet_readl(HPET_Tn_CMP(i)); h = hpet_readl(HPET_Tn_CMP(i)+4); pr_info("T%d: CMP_l: 0x%x, CMP_h: 0x%x\n", i, l, h); l = hpet_readl(HPET_Tn_ROUTE(i)); h = hpet_readl(HPET_Tn_ROUTE(i)+4); pr_info("T%d ROUTE_l: 0x%x, ROUTE_h: 0x%x\n", i, l, h); } } #define hpet_print_config() \ do { \ if (hpet_verbose) \ _hpet_print_config(__func__, __LINE__); \ } while (0) /* * When the HPET driver (/dev/hpet) is enabled, we need to reserve * timer 0 and timer 1 in case of RTC emulation. */ #ifdef CONFIG_HPET static void __init hpet_reserve_platform_timers(void) { struct hpet_data hd; unsigned int i; memset(&hd, 0, sizeof(hd)); hd.hd_phys_address = hpet_address; hd.hd_address = hpet_virt_address; hd.hd_nirqs = hpet_base.nr_channels; /* * NOTE that hd_irq[] reflects IOAPIC input pins (LEGACY_8254 * is wrong for i8259!) not the output IRQ. Many BIOS writers * don't bother configuring *any* comparator interrupts. */ hd.hd_irq[0] = HPET_LEGACY_8254; hd.hd_irq[1] = HPET_LEGACY_RTC; for (i = 0; i < hpet_base.nr_channels; i++) { struct hpet_channel *hc = hpet_base.channels + i; if (i >= 2) hd.hd_irq[i] = hc->irq; switch (hc->mode) { case HPET_MODE_UNUSED: case HPET_MODE_DEVICE: hc->mode = HPET_MODE_DEVICE; break; case HPET_MODE_CLOCKEVT: case HPET_MODE_LEGACY: hpet_reserve_timer(&hd, hc->num); break; } } hpet_alloc(&hd); } static void __init hpet_select_device_channel(void) { int i; for (i = 0; i < hpet_base.nr_channels; i++) { struct hpet_channel *hc = hpet_base.channels + i; /* Associate the first unused channel to /dev/hpet */ if (hc->mode == HPET_MODE_UNUSED) { hc->mode = HPET_MODE_DEVICE; return; } } } #else static inline void hpet_reserve_platform_timers(void) { } static inline void hpet_select_device_channel(void) {} #endif /* Common HPET functions */ static void hpet_stop_counter(void) { u32 cfg = hpet_readl(HPET_CFG); cfg &= ~HPET_CFG_ENABLE; hpet_writel(cfg, HPET_CFG); } static void hpet_reset_counter(void) { hpet_writel(0, HPET_COUNTER); hpet_writel(0, HPET_COUNTER + 4); } static void hpet_start_counter(void) { unsigned int cfg = hpet_readl(HPET_CFG); cfg |= HPET_CFG_ENABLE; hpet_writel(cfg, HPET_CFG); } static void hpet_restart_counter(void) { hpet_stop_counter(); hpet_reset_counter(); hpet_start_counter(); } static void hpet_resume_device(void) { force_hpet_resume(); } static void hpet_resume_counter(struct clocksource *cs) { hpet_resume_device(); hpet_restart_counter(); } static void hpet_enable_legacy_int(void) { unsigned int cfg = hpet_readl(HPET_CFG); cfg |= HPET_CFG_LEGACY; hpet_writel(cfg, HPET_CFG); hpet_legacy_int_enabled = true; } static int hpet_clkevt_set_state_periodic(struct clock_event_device *evt) { unsigned int channel = clockevent_to_channel(evt)->num; unsigned int cfg, cmp, now; uint64_t delta; hpet_stop_counter(); delta = ((uint64_t)(NSEC_PER_SEC / HZ)) * evt->mult; delta >>= evt->shift; now = hpet_readl(HPET_COUNTER); cmp = now + (unsigned int)delta; cfg = hpet_readl(HPET_Tn_CFG(channel)); cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL | HPET_TN_32BIT; hpet_writel(cfg, HPET_Tn_CFG(channel)); hpet_writel(cmp, HPET_Tn_CMP(channel)); udelay(1); /* * HPET on AMD 81xx needs a second write (with HPET_TN_SETVAL * cleared) to T0_CMP to set the period. The HPET_TN_SETVAL * bit is automatically cleared after the first write. * (See AMD-8111 HyperTransport I/O Hub Data Sheet, * Publication # 24674) */ hpet_writel((unsigned int)delta, HPET_Tn_CMP(channel)); hpet_start_counter(); hpet_print_config(); return 0; } static int hpet_clkevt_set_state_oneshot(struct clock_event_device *evt) { unsigned int channel = clockevent_to_channel(evt)->num; unsigned int cfg; cfg = hpet_readl(HPET_Tn_CFG(channel)); cfg &= ~HPET_TN_PERIODIC; cfg |= HPET_TN_ENABLE | HPET_TN_32BIT; hpet_writel(cfg, HPET_Tn_CFG(channel)); return 0; } static int hpet_clkevt_set_state_shutdown(struct clock_event_device *evt) { unsigned int channel = clockevent_to_channel(evt)->num; unsigned int cfg; cfg = hpet_readl(HPET_Tn_CFG(channel)); cfg &= ~HPET_TN_ENABLE; hpet_writel(cfg, HPET_Tn_CFG(channel)); return 0; } static int hpet_clkevt_legacy_resume(struct clock_event_device *evt) { hpet_enable_legacy_int(); hpet_print_config(); return 0; } static int hpet_clkevt_set_next_event(unsigned long delta, struct clock_event_device *evt) { unsigned int channel = clockevent_to_channel(evt)->num; u32 cnt; s32 res; cnt = hpet_readl(HPET_COUNTER); cnt += (u32) delta; hpet_writel(cnt, HPET_Tn_CMP(channel)); /* * HPETs are a complete disaster. The compare register is * based on a equal comparison and neither provides a less * than or equal functionality (which would require to take * the wraparound into account) nor a simple count down event * mode. Further the write to the comparator register is * delayed internally up to two HPET clock cycles in certain * chipsets (ATI, ICH9,10). Some newer AMD chipsets have even * longer delays. We worked around that by reading back the * compare register, but that required another workaround for * ICH9,10 chips where the first readout after write can * return the old stale value. We already had a minimum * programming delta of 5us enforced, but a NMI or SMI hitting * between the counter readout and the comparator write can * move us behind that point easily. Now instead of reading * the compare register back several times, we make the ETIME * decision based on the following: Return ETIME if the * counter value after the write is less than HPET_MIN_CYCLES * away from the event or if the counter is already ahead of * the event. The minimum programming delta for the generic * clockevents code is set to 1.5 * HPET_MIN_CYCLES. */ res = (s32)(cnt - hpet_readl(HPET_COUNTER)); return res < HPET_MIN_CYCLES ? -ETIME : 0; } static void hpet_init_clockevent(struct hpet_channel *hc, unsigned int rating) { struct clock_event_device *evt = &hc->evt; evt->rating = rating; evt->irq = hc->irq; evt->name = hc->name; evt->cpumask = cpumask_of(hc->cpu); evt->set_state_oneshot = hpet_clkevt_set_state_oneshot; evt->set_next_event = hpet_clkevt_set_next_event; evt->set_state_shutdown = hpet_clkevt_set_state_shutdown; evt->features = CLOCK_EVT_FEAT_ONESHOT; if (hc->boot_cfg & HPET_TN_PERIODIC) { evt->features |= CLOCK_EVT_FEAT_PERIODIC; evt->set_state_periodic = hpet_clkevt_set_state_periodic; } } static void __init hpet_legacy_clockevent_register(struct hpet_channel *hc) { /* * Start HPET with the boot CPU's cpumask and make it global after * the IO_APIC has been initialized. */ hc->cpu = boot_cpu_data.cpu_index; strscpy(hc->name, "hpet", sizeof(hc->name)); hpet_init_clockevent(hc, 50); hc->evt.tick_resume = hpet_clkevt_legacy_resume; /* * Legacy horrors and sins from the past. HPET used periodic mode * unconditionally forever on the legacy channel 0. Removing the * below hack and using the conditional in hpet_init_clockevent() * makes at least Qemu and one hardware machine fail to boot. * There are two issues which cause the boot failure: * * #1 After the timer delivery test in IOAPIC and the IOAPIC setup * the next interrupt is not delivered despite the HPET channel * being programmed correctly. Reprogramming the HPET after * switching to IOAPIC makes it work again. After fixing this, * the next issue surfaces: * * #2 Due to the unconditional periodic mode availability the Local * APIC timer calibration can hijack the global clockevents * event handler without causing damage. Using oneshot at this * stage makes if hang because the HPET does not get * reprogrammed due to the handler hijacking. Duh, stupid me! * * Both issues require major surgery and especially the kick HPET * again after enabling IOAPIC results in really nasty hackery. * This 'assume periodic works' magic has survived since HPET * support got added, so it's questionable whether this should be * fixed. Both Qemu and the failing hardware machine support * periodic mode despite the fact that both don't advertise it in * the configuration register and both need that extra kick after * switching to IOAPIC. Seems to be a feature... */ hc->evt.features |= CLOCK_EVT_FEAT_PERIODIC; hc->evt.set_state_periodic = hpet_clkevt_set_state_periodic; /* Start HPET legacy interrupts */ hpet_enable_legacy_int(); clockevents_config_and_register(&hc->evt, hpet_freq, HPET_MIN_PROG_DELTA, 0x7FFFFFFF); global_clock_event = &hc->evt; pr_debug("Clockevent registered\n"); } /* * HPET MSI Support */ #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_GENERIC_MSI_IRQ) static void hpet_msi_unmask(struct irq_data *data) { struct hpet_channel *hc = irq_data_get_irq_handler_data(data); unsigned int cfg; cfg = hpet_readl(HPET_Tn_CFG(hc->num)); cfg |= HPET_TN_ENABLE | HPET_TN_FSB; hpet_writel(cfg, HPET_Tn_CFG(hc->num)); } static void hpet_msi_mask(struct irq_data *data) { struct hpet_channel *hc = irq_data_get_irq_handler_data(data); unsigned int cfg; cfg = hpet_readl(HPET_Tn_CFG(hc->num)); cfg &= ~(HPET_TN_ENABLE | HPET_TN_FSB); hpet_writel(cfg, HPET_Tn_CFG(hc->num)); } static void hpet_msi_write(struct hpet_channel *hc, struct msi_msg *msg) { hpet_writel(msg->data, HPET_Tn_ROUTE(hc->num)); hpet_writel(msg->address_lo, HPET_Tn_ROUTE(hc->num) + 4); } static void hpet_msi_write_msg(struct irq_data *data, struct msi_msg *msg) { hpet_msi_write(irq_data_get_irq_handler_data(data), msg); } static struct irq_chip hpet_msi_controller __ro_after_init = { .name = "HPET-MSI", .irq_unmask = hpet_msi_unmask, .irq_mask = hpet_msi_mask, .irq_ack = irq_chip_ack_parent, .irq_set_affinity = msi_domain_set_affinity, .irq_retrigger = irq_chip_retrigger_hierarchy, .irq_write_msi_msg = hpet_msi_write_msg, .flags = IRQCHIP_SKIP_SET_WAKE | IRQCHIP_AFFINITY_PRE_STARTUP, }; static int hpet_msi_init(struct irq_domain *domain, struct msi_domain_info *info, unsigned int virq, irq_hw_number_t hwirq, msi_alloc_info_t *arg) { irq_set_status_flags(virq, IRQ_MOVE_PCNTXT); irq_domain_set_info(domain, virq, arg->hwirq, info->chip, NULL, handle_edge_irq, arg->data, "edge"); return 0; } static void hpet_msi_free(struct irq_domain *domain, struct msi_domain_info *info, unsigned int virq) { irq_clear_status_flags(virq, IRQ_MOVE_PCNTXT); } static struct msi_domain_ops hpet_msi_domain_ops = { .msi_init = hpet_msi_init, .msi_free = hpet_msi_free, }; static struct msi_domain_info hpet_msi_domain_info = { .ops = &hpet_msi_domain_ops, .chip = &hpet_msi_controller, .flags = MSI_FLAG_USE_DEF_DOM_OPS, }; static struct irq_domain *hpet_create_irq_domain(int hpet_id) { struct msi_domain_info *domain_info; struct irq_domain *parent, *d; struct fwnode_handle *fn; struct irq_fwspec fwspec; if (x86_vector_domain == NULL) return NULL; domain_info = kzalloc(sizeof(*domain_info), GFP_KERNEL); if (!domain_info) return NULL; *domain_info = hpet_msi_domain_info; domain_info->data = (void *)(long)hpet_id; fn = irq_domain_alloc_named_id_fwnode(hpet_msi_controller.name, hpet_id); if (!fn) { kfree(domain_info); return NULL; } fwspec.fwnode = fn; fwspec.param_count = 1; fwspec.param[0] = hpet_id; parent = irq_find_matching_fwspec(&fwspec, DOMAIN_BUS_ANY); if (!parent) { irq_domain_free_fwnode(fn); kfree(domain_info); return NULL; } if (parent != x86_vector_domain) hpet_msi_controller.name = "IR-HPET-MSI"; d = msi_create_irq_domain(fn, domain_info, parent); if (!d) { irq_domain_free_fwnode(fn); kfree(domain_info); } return d; } static inline int hpet_dev_id(struct irq_domain *domain) { struct msi_domain_info *info = msi_get_domain_info(domain); return (int)(long)info->data; } static int hpet_assign_irq(struct irq_domain *domain, struct hpet_channel *hc, int dev_num) { struct irq_alloc_info info; init_irq_alloc_info(&info, NULL); info.type = X86_IRQ_ALLOC_TYPE_HPET; info.data = hc; info.devid = hpet_dev_id(domain); info.hwirq = dev_num; return irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, &info); } static int hpet_clkevt_msi_resume(struct clock_event_device *evt) { struct hpet_channel *hc = clockevent_to_channel(evt); struct irq_data *data = irq_get_irq_data(hc->irq); struct msi_msg msg; /* Restore the MSI msg and unmask the interrupt */ irq_chip_compose_msi_msg(data, &msg); hpet_msi_write(hc, &msg); hpet_msi_unmask(data); return 0; } static irqreturn_t hpet_msi_interrupt_handler(int irq, void *data) { struct hpet_channel *hc = data; struct clock_event_device *evt = &hc->evt; if (!evt->event_handler) { pr_info("Spurious interrupt HPET channel %d\n", hc->num); return IRQ_HANDLED; } evt->event_handler(evt); return IRQ_HANDLED; } static int hpet_setup_msi_irq(struct hpet_channel *hc) { if (request_irq(hc->irq, hpet_msi_interrupt_handler, IRQF_TIMER | IRQF_NOBALANCING, hc->name, hc)) return -1; disable_irq(hc->irq); irq_set_affinity(hc->irq, cpumask_of(hc->cpu)); enable_irq(hc->irq); pr_debug("%s irq %u for MSI\n", hc->name, hc->irq); return 0; } /* Invoked from the hotplug callback on @cpu */ static void init_one_hpet_msi_clockevent(struct hpet_channel *hc, int cpu) { struct clock_event_device *evt = &hc->evt; hc->cpu = cpu; per_cpu(cpu_hpet_channel, cpu) = hc; hpet_setup_msi_irq(hc); hpet_init_clockevent(hc, 110); evt->tick_resume = hpet_clkevt_msi_resume; clockevents_config_and_register(evt, hpet_freq, HPET_MIN_PROG_DELTA, 0x7FFFFFFF); } static struct hpet_channel *hpet_get_unused_clockevent(void) { int i; for (i = 0; i < hpet_base.nr_channels; i++) { struct hpet_channel *hc = hpet_base.channels + i; if (hc->mode != HPET_MODE_CLOCKEVT || hc->in_use) continue; hc->in_use = 1; return hc; } return NULL; } static int hpet_cpuhp_online(unsigned int cpu) { struct hpet_channel *hc = hpet_get_unused_clockevent(); if (hc) init_one_hpet_msi_clockevent(hc, cpu); return 0; } static int hpet_cpuhp_dead(unsigned int cpu) { struct hpet_channel *hc = per_cpu(cpu_hpet_channel, cpu); if (!hc) return 0; free_irq(hc->irq, hc); hc->in_use = 0; per_cpu(cpu_hpet_channel, cpu) = NULL; return 0; } static void __init hpet_select_clockevents(void) { unsigned int i; hpet_base.nr_clockevents = 0; /* No point if MSI is disabled or CPU has an Always Running APIC Timer */ if (hpet_msi_disable || boot_cpu_has(X86_FEATURE_ARAT)) return; hpet_print_config(); hpet_domain = hpet_create_irq_domain(hpet_blockid); if (!hpet_domain) return; for (i = 0; i < hpet_base.nr_channels; i++) { struct hpet_channel *hc = hpet_base.channels + i; int irq; if (hc->mode != HPET_MODE_UNUSED) continue; /* Only consider HPET channel with MSI support */ if (!(hc->boot_cfg & HPET_TN_FSB_CAP)) continue; sprintf(hc->name, "hpet%d", i); irq = hpet_assign_irq(hpet_domain, hc, hc->num); if (irq <= 0) continue; hc->irq = irq; hc->mode = HPET_MODE_CLOCKEVT; if (++hpet_base.nr_clockevents == num_possible_cpus()) break; } pr_info("%d channels of %d reserved for per-cpu timers\n", hpet_base.nr_channels, hpet_base.nr_clockevents); } #else static inline void hpet_select_clockevents(void) { } #define hpet_cpuhp_online NULL #define hpet_cpuhp_dead NULL #endif /* * Clock source related code */ #if defined(CONFIG_SMP) && defined(CONFIG_64BIT) /* * Reading the HPET counter is a very slow operation. If a large number of * CPUs are trying to access the HPET counter simultaneously, it can cause * massive delays and slow down system performance dramatically. This may * happen when HPET is the default clock source instead of TSC. For a * really large system with hundreds of CPUs, the slowdown may be so * severe, that it can actually crash the system because of a NMI watchdog * soft lockup, for example. * * If multiple CPUs are trying to access the HPET counter at the same time, * we don't actually need to read the counter multiple times. Instead, the * other CPUs can use the counter value read by the first CPU in the group. * * This special feature is only enabled on x86-64 systems. It is unlikely * that 32-bit x86 systems will have enough CPUs to require this feature * with its associated locking overhead. We also need 64-bit atomic read. * * The lock and the HPET value are stored together and can be read in a * single atomic 64-bit read. It is explicitly assumed that arch_spinlock_t * is 32 bits in size. */ union hpet_lock { struct { arch_spinlock_t lock; u32 value; }; u64 lockval; }; static union hpet_lock hpet __cacheline_aligned = { { .lock = __ARCH_SPIN_LOCK_UNLOCKED, }, }; static u64 read_hpet(struct clocksource *cs) { unsigned long flags; union hpet_lock old, new; BUILD_BUG_ON(sizeof(union hpet_lock) != 8); /* * Read HPET directly if in NMI. */ if (in_nmi()) return (u64)hpet_readl(HPET_COUNTER); /* * Read the current state of the lock and HPET value atomically. */ old.lockval = READ_ONCE(hpet.lockval); if (arch_spin_is_locked(&old.lock)) goto contended; local_irq_save(flags); if (arch_spin_trylock(&hpet.lock)) { new.value = hpet_readl(HPET_COUNTER); /* * Use WRITE_ONCE() to prevent store tearing. */ WRITE_ONCE(hpet.value, new.value); arch_spin_unlock(&hpet.lock); local_irq_restore(flags); return (u64)new.value; } local_irq_restore(flags); contended: /* * Contended case * -------------- * Wait until the HPET value change or the lock is free to indicate * its value is up-to-date. * * It is possible that old.value has already contained the latest * HPET value while the lock holder was in the process of releasing * the lock. Checking for lock state change will enable us to return * the value immediately instead of waiting for the next HPET reader * to come along. */ do { cpu_relax(); new.lockval = READ_ONCE(hpet.lockval); } while ((new.value == old.value) && arch_spin_is_locked(&new.lock)); return (u64)new.value; } #else /* * For UP or 32-bit. */ static u64 read_hpet(struct clocksource *cs) { return (u64)hpet_readl(HPET_COUNTER); } #endif static struct clocksource clocksource_hpet = { .name = "hpet", .rating = 250, .read = read_hpet, .mask = HPET_MASK, .flags = CLOCK_SOURCE_IS_CONTINUOUS, .resume = hpet_resume_counter, }; /* * AMD SB700 based systems with spread spectrum enabled use a SMM based * HPET emulation to provide proper frequency setting. * * On such systems the SMM code is initialized with the first HPET register * access and takes some time to complete. During this time the config * register reads 0xffffffff. We check for max 1000 loops whether the * config register reads a non-0xffffffff value to make sure that the * HPET is up and running before we proceed any further. * * A counting loop is safe, as the HPET access takes thousands of CPU cycles. * * On non-SB700 based machines this check is only done once and has no * side effects. */ static bool __init hpet_cfg_working(void) { int i; for (i = 0; i < 1000; i++) { if (hpet_readl(HPET_CFG) != 0xFFFFFFFF) return true; } pr_warn("Config register invalid. Disabling HPET\n"); return false; } static bool __init hpet_counting(void) { u64 start, now, t1; hpet_restart_counter(); t1 = hpet_readl(HPET_COUNTER); start = rdtsc(); /* * We don't know the TSC frequency yet, but waiting for * 200000 TSC cycles is safe: * 4 GHz == 50us * 1 GHz == 200us */ do { if (t1 != hpet_readl(HPET_COUNTER)) return true; now = rdtsc(); } while ((now - start) < 200000UL); pr_warn("Counter not counting. HPET disabled\n"); return false; } static bool __init mwait_pc10_supported(void) { unsigned int eax, ebx, ecx, mwait_substates; if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return false; if (!cpu_feature_enabled(X86_FEATURE_MWAIT)) return false; if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF) return false; cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &mwait_substates); return (ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) && (ecx & CPUID5_ECX_INTERRUPT_BREAK) && (mwait_substates & (0xF << 28)); } /* * Check whether the system supports PC10. If so force disable HPET as that * stops counting in PC10. This check is overbroad as it does not take any * of the following into account: * * - ACPI tables * - Enablement of intel_idle * - Command line arguments which limit intel_idle C-state support * * That's perfectly fine. HPET is a piece of hardware designed by committee * and the only reasons why it is still in use on modern systems is the * fact that it is impossible to reliably query TSC and CPU frequency via * CPUID or firmware. * * If HPET is functional it is useful for calibrating TSC, but this can be * done via PMTIMER as well which seems to be the last remaining timer on * X86/INTEL platforms that has not been completely wreckaged by feature * creep. * * In theory HPET support should be removed altogether, but there are older * systems out there which depend on it because TSC and APIC timer are * dysfunctional in deeper C-states. * * It's only 20 years now that hardware people have been asked to provide * reliable and discoverable facilities which can be used for timekeeping * and per CPU timer interrupts. * * The probability that this problem is going to be solved in the * foreseeable future is close to zero, so the kernel has to be cluttered * with heuristics to keep up with the ever growing amount of hardware and * firmware trainwrecks. Hopefully some day hardware people will understand * that the approach of "This can be fixed in software" is not sustainable. * Hope dies last... */ static bool __init hpet_is_pc10_damaged(void) { unsigned long long pcfg; /* Check whether PC10 substates are supported */ if (!mwait_pc10_supported()) return false; /* Check whether PC10 is enabled in PKG C-state limit */ rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, pcfg); if ((pcfg & 0xF) < 8) return false; if (hpet_force_user) { pr_warn("HPET force enabled via command line, but dysfunctional in PC10.\n"); return false; } pr_info("HPET dysfunctional in PC10. Force disabled.\n"); boot_hpet_disable = true; return true; } /** * hpet_enable - Try to setup the HPET timer. Returns 1 on success. */ int __init hpet_enable(void) { u32 hpet_period, cfg, id, irq; unsigned int i, channels; struct hpet_channel *hc; u64 freq; if (!is_hpet_capable()) return 0; if (hpet_is_pc10_damaged()) return 0; hpet_set_mapping(); if (!hpet_virt_address) return 0; /* Validate that the config register is working */ if (!hpet_cfg_working()) goto out_nohpet; /* * Read the period and check for a sane value: */ hpet_period = hpet_readl(HPET_PERIOD); if (hpet_period < HPET_MIN_PERIOD || hpet_period > HPET_MAX_PERIOD) goto out_nohpet; /* The period is a femtoseconds value. Convert it to a frequency. */ freq = FSEC_PER_SEC; do_div(freq, hpet_period); hpet_freq = freq; /* * Read the HPET ID register to retrieve the IRQ routing * information and the number of channels */ id = hpet_readl(HPET_ID); hpet_print_config(); /* This is the HPET channel number which is zero based */ channels = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1; /* * The legacy routing mode needs at least two channels, tick timer * and the rtc emulation channel. */ if (IS_ENABLED(CONFIG_HPET_EMULATE_RTC) && channels < 2) goto out_nohpet; hc = kcalloc(channels, sizeof(*hc), GFP_KERNEL); if (!hc) { pr_warn("Disabling HPET.\n"); goto out_nohpet; } hpet_base.channels = hc; hpet_base.nr_channels = channels; /* Read, store and sanitize the global configuration */ cfg = hpet_readl(HPET_CFG); hpet_base.boot_cfg = cfg; cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY); hpet_writel(cfg, HPET_CFG); if (cfg) pr_warn("Global config: Unknown bits %#x\n", cfg); /* Read, store and sanitize the per channel configuration */ for (i = 0; i < channels; i++, hc++) { hc->num = i; cfg = hpet_readl(HPET_Tn_CFG(i)); hc->boot_cfg = cfg; irq = (cfg & Tn_INT_ROUTE_CNF_MASK) >> Tn_INT_ROUTE_CNF_SHIFT; hc->irq = irq; cfg &= ~(HPET_TN_ENABLE | HPET_TN_LEVEL | HPET_TN_FSB); hpet_writel(cfg, HPET_Tn_CFG(i)); cfg &= ~(HPET_TN_PERIODIC | HPET_TN_PERIODIC_CAP | HPET_TN_64BIT_CAP | HPET_TN_32BIT | HPET_TN_ROUTE | HPET_TN_FSB | HPET_TN_FSB_CAP); if (cfg) pr_warn("Channel #%u config: Unknown bits %#x\n", i, cfg); } hpet_print_config(); /* * Validate that the counter is counting. This needs to be done * after sanitizing the config registers to properly deal with * force enabled HPETs. */ if (!hpet_counting()) goto out_nohpet; if (tsc_clocksource_watchdog_disabled()) clocksource_hpet.flags |= CLOCK_SOURCE_MUST_VERIFY; clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq); if (id & HPET_ID_LEGSUP) { hpet_legacy_clockevent_register(&hpet_base.channels[0]); hpet_base.channels[0].mode = HPET_MODE_LEGACY; if (IS_ENABLED(CONFIG_HPET_EMULATE_RTC)) hpet_base.channels[1].mode = HPET_MODE_LEGACY; return 1; } return 0; out_nohpet: kfree(hpet_base.channels); hpet_base.channels = NULL; hpet_base.nr_channels = 0; hpet_clear_mapping(); hpet_address = 0; return 0; } /* * The late initialization runs after the PCI quirks have been invoked * which might have detected a system on which the HPET can be enforced. * * Also, the MSI machinery is not working yet when the HPET is initialized * early. * * If the HPET is enabled, then: * * 1) Reserve one channel for /dev/hpet if CONFIG_HPET=y * 2) Reserve up to num_possible_cpus() channels as per CPU clockevents * 3) Setup /dev/hpet if CONFIG_HPET=y * 4) Register hotplug callbacks when clockevents are available */ static __init int hpet_late_init(void) { int ret; if (!hpet_address) { if (!force_hpet_address) return -ENODEV; hpet_address = force_hpet_address; hpet_enable(); } if (!hpet_virt_address) return -ENODEV; hpet_select_device_channel(); hpet_select_clockevents(); hpet_reserve_platform_timers(); hpet_print_config(); if (!hpet_base.nr_clockevents) return 0; ret = cpuhp_setup_state(CPUHP_AP_X86_HPET_ONLINE, "x86/hpet:online", hpet_cpuhp_online, NULL); if (ret) return ret; ret = cpuhp_setup_state(CPUHP_X86_HPET_DEAD, "x86/hpet:dead", NULL, hpet_cpuhp_dead); if (ret) goto err_cpuhp; return 0; err_cpuhp: cpuhp_remove_state(CPUHP_AP_X86_HPET_ONLINE); return ret; } fs_initcall(hpet_late_init); void hpet_disable(void) { unsigned int i; u32 cfg; if (!is_hpet_capable() || !hpet_virt_address) return; /* Restore boot configuration with the enable bit cleared */ cfg = hpet_base.boot_cfg; cfg &= ~HPET_CFG_ENABLE; hpet_writel(cfg, HPET_CFG); /* Restore the channel boot configuration */ for (i = 0; i < hpet_base.nr_channels; i++) hpet_writel(hpet_base.channels[i].boot_cfg, HPET_Tn_CFG(i)); /* If the HPET was enabled at boot time, reenable it */ if (hpet_base.boot_cfg & HPET_CFG_ENABLE) hpet_writel(hpet_base.boot_cfg, HPET_CFG); } #ifdef CONFIG_HPET_EMULATE_RTC /* * HPET in LegacyReplacement mode eats up the RTC interrupt line. When HPET * is enabled, we support RTC interrupt functionality in software. * * RTC has 3 kinds of interrupts: * * 1) Update Interrupt - generate an interrupt, every second, when the * RTC clock is updated * 2) Alarm Interrupt - generate an interrupt at a specific time of day * 3) Periodic Interrupt - generate periodic interrupt, with frequencies * 2Hz-8192Hz (2Hz-64Hz for non-root user) (all frequencies in powers of 2) * * (1) and (2) above are implemented using polling at a frequency of 64 Hz: * DEFAULT_RTC_INT_FREQ. * * The exact frequency is a tradeoff between accuracy and interrupt overhead. * * For (3), we use interrupts at 64 Hz, or the user specified periodic frequency, * if it's higher. */ #include <linux/mc146818rtc.h> #include <linux/rtc.h> #define DEFAULT_RTC_INT_FREQ 64 #define DEFAULT_RTC_SHIFT 6 #define RTC_NUM_INTS 1 static unsigned long hpet_rtc_flags; static int hpet_prev_update_sec; static struct rtc_time hpet_alarm_time; static unsigned long hpet_pie_count; static u32 hpet_t1_cmp; static u32 hpet_default_delta; static u32 hpet_pie_delta; static unsigned long hpet_pie_limit; static rtc_irq_handler irq_handler; /* * Check that the HPET counter c1 is ahead of c2 */ static inline int hpet_cnt_ahead(u32 c1, u32 c2) { return (s32)(c2 - c1) < 0; } /* * Registers a IRQ handler. */ int hpet_register_irq_handler(rtc_irq_handler handler) { if (!is_hpet_enabled()) return -ENODEV; if (irq_handler) return -EBUSY; irq_handler = handler; return 0; } EXPORT_SYMBOL_GPL(hpet_register_irq_handler); /* * Deregisters the IRQ handler registered with hpet_register_irq_handler() * and does cleanup. */ void hpet_unregister_irq_handler(rtc_irq_handler handler) { if (!is_hpet_enabled()) return; irq_handler = NULL; hpet_rtc_flags = 0; } EXPORT_SYMBOL_GPL(hpet_unregister_irq_handler); /* * Channel 1 for RTC emulation. We use one shot mode, as periodic mode * is not supported by all HPET implementations for channel 1. * * hpet_rtc_timer_init() is called when the rtc is initialized. */ int hpet_rtc_timer_init(void) { unsigned int cfg, cnt, delta; unsigned long flags; if (!is_hpet_enabled()) return 0; if (!hpet_default_delta) { struct clock_event_device *evt = &hpet_base.channels[0].evt; uint64_t clc; clc = (uint64_t) evt->mult * NSEC_PER_SEC; clc >>= evt->shift + DEFAULT_RTC_SHIFT; hpet_default_delta = clc; } if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit) delta = hpet_default_delta; else delta = hpet_pie_delta; local_irq_save(flags); cnt = delta + hpet_readl(HPET_COUNTER); hpet_writel(cnt, HPET_T1_CMP); hpet_t1_cmp = cnt; cfg = hpet_readl(HPET_T1_CFG); cfg &= ~HPET_TN_PERIODIC; cfg |= HPET_TN_ENABLE | HPET_TN_32BIT; hpet_writel(cfg, HPET_T1_CFG); local_irq_restore(flags); return 1; } EXPORT_SYMBOL_GPL(hpet_rtc_timer_init); static void hpet_disable_rtc_channel(void) { u32 cfg = hpet_readl(HPET_T1_CFG); cfg &= ~HPET_TN_ENABLE; hpet_writel(cfg, HPET_T1_CFG); } /* * The functions below are called from rtc driver. * Return 0 if HPET is not being used. * Otherwise do the necessary changes and return 1. */ int hpet_mask_rtc_irq_bit(unsigned long bit_mask) { if (!is_hpet_enabled()) return 0; hpet_rtc_flags &= ~bit_mask; if (unlikely(!hpet_rtc_flags)) hpet_disable_rtc_channel(); return 1; } EXPORT_SYMBOL_GPL(hpet_mask_rtc_irq_bit); int hpet_set_rtc_irq_bit(unsigned long bit_mask) { unsigned long oldbits = hpet_rtc_flags; if (!is_hpet_enabled()) return 0; hpet_rtc_flags |= bit_mask; if ((bit_mask & RTC_UIE) && !(oldbits & RTC_UIE)) hpet_prev_update_sec = -1; if (!oldbits) hpet_rtc_timer_init(); return 1; } EXPORT_SYMBOL_GPL(hpet_set_rtc_irq_bit); int hpet_set_alarm_time(unsigned char hrs, unsigned char min, unsigned char sec) { if (!is_hpet_enabled()) return 0; hpet_alarm_time.tm_hour = hrs; hpet_alarm_time.tm_min = min; hpet_alarm_time.tm_sec = sec; return 1; } EXPORT_SYMBOL_GPL(hpet_set_alarm_time); int hpet_set_periodic_freq(unsigned long freq) { uint64_t clc; if (!is_hpet_enabled()) return 0; if (freq <= DEFAULT_RTC_INT_FREQ) { hpet_pie_limit = DEFAULT_RTC_INT_FREQ / freq; } else { struct clock_event_device *evt = &hpet_base.channels[0].evt; clc = (uint64_t) evt->mult * NSEC_PER_SEC; do_div(clc, freq); clc >>= evt->shift; hpet_pie_delta = clc; hpet_pie_limit = 0; } return 1; } EXPORT_SYMBOL_GPL(hpet_set_periodic_freq); int hpet_rtc_dropped_irq(void) { return is_hpet_enabled(); } EXPORT_SYMBOL_GPL(hpet_rtc_dropped_irq); static void hpet_rtc_timer_reinit(void) { unsigned int delta; int lost_ints = -1; if (unlikely(!hpet_rtc_flags)) hpet_disable_rtc_channel(); if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit) delta = hpet_default_delta; else delta = hpet_pie_delta; /* * Increment the comparator value until we are ahead of the * current count. */ do { hpet_t1_cmp += delta; hpet_writel(hpet_t1_cmp, HPET_T1_CMP); lost_ints++; } while (!hpet_cnt_ahead(hpet_t1_cmp, hpet_readl(HPET_COUNTER))); if (lost_ints) { if (hpet_rtc_flags & RTC_PIE) hpet_pie_count += lost_ints; if (printk_ratelimit()) pr_warn("Lost %d RTC interrupts\n", lost_ints); } } irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id) { struct rtc_time curr_time; unsigned long rtc_int_flag = 0; hpet_rtc_timer_reinit(); memset(&curr_time, 0, sizeof(struct rtc_time)); if (hpet_rtc_flags & (RTC_UIE | RTC_AIE)) { if (unlikely(mc146818_get_time(&curr_time) < 0)) { pr_err_ratelimited("unable to read current time from RTC\n"); return IRQ_HANDLED; } } if (hpet_rtc_flags & RTC_UIE && curr_time.tm_sec != hpet_prev_update_sec) { if (hpet_prev_update_sec >= 0) rtc_int_flag = RTC_UF; hpet_prev_update_sec = curr_time.tm_sec; } if (hpet_rtc_flags & RTC_PIE && ++hpet_pie_count >= hpet_pie_limit) { rtc_int_flag |= RTC_PF; hpet_pie_count = 0; } if (hpet_rtc_flags & RTC_AIE && (curr_time.tm_sec == hpet_alarm_time.tm_sec) && (curr_time.tm_min == hpet_alarm_time.tm_min) && (curr_time.tm_hour == hpet_alarm_time.tm_hour)) rtc_int_flag |= RTC_AF; if (rtc_int_flag) { rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8)); if (irq_handler) irq_handler(rtc_int_flag, dev_id); } return IRQ_HANDLED; } EXPORT_SYMBOL_GPL(hpet_rtc_interrupt); #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Definitions for the 'struct skb_array' datastructure. * * Author: * Michael S. Tsirkin <mst@redhat.com> * * Copyright (C) 2016 Red Hat, Inc. * * Limited-size FIFO of skbs. Can be used more or less whenever * sk_buff_head can be used, except you need to know the queue size in * advance. * Implemented as a type-safe wrapper around ptr_ring. */ #ifndef _LINUX_SKB_ARRAY_H #define _LINUX_SKB_ARRAY_H 1 #ifdef __KERNEL__ #include <linux/ptr_ring.h> #include <linux/skbuff.h> #include <linux/if_vlan.h> #endif struct skb_array { struct ptr_ring ring; }; /* Might be slightly faster than skb_array_full below, but callers invoking * this in a loop must use a compiler barrier, for example cpu_relax(). */ static inline bool __skb_array_full(struct skb_array *a) { return __ptr_ring_full(&a->ring); } static inline bool skb_array_full(struct skb_array *a) { return ptr_ring_full(&a->ring); } static inline int skb_array_produce(struct skb_array *a, struct sk_buff *skb) { return ptr_ring_produce(&a->ring, skb); } static inline int skb_array_produce_irq(struct skb_array *a, struct sk_buff *skb) { return ptr_ring_produce_irq(&a->ring, skb); } static inline int skb_array_produce_bh(struct skb_array *a, struct sk_buff *skb) { return ptr_ring_produce_bh(&a->ring, skb); } static inline int skb_array_produce_any(struct skb_array *a, struct sk_buff *skb) { return ptr_ring_produce_any(&a->ring, skb); } /* Might be slightly faster than skb_array_empty below, but only safe if the * array is never resized. Also, callers invoking this in a loop must take care * to use a compiler barrier, for example cpu_relax(). */ static inline bool __skb_array_empty(struct skb_array *a) { return __ptr_ring_empty(&a->ring); } static inline struct sk_buff *__skb_array_peek(struct skb_array *a) { return __ptr_ring_peek(&a->ring); } static inline bool skb_array_empty(struct skb_array *a) { return ptr_ring_empty(&a->ring); } static inline bool skb_array_empty_bh(struct skb_array *a) { return ptr_ring_empty_bh(&a->ring); } static inline bool skb_array_empty_irq(struct skb_array *a) { return ptr_ring_empty_irq(&a->ring); } static inline bool skb_array_empty_any(struct skb_array *a) { return ptr_ring_empty_any(&a->ring); } static inline struct sk_buff *__skb_array_consume(struct skb_array *a) { return __ptr_ring_consume(&a->ring); } static inline struct sk_buff *skb_array_consume(struct skb_array *a) { return ptr_ring_consume(&a->ring); } static inline int skb_array_consume_batched(struct skb_array *a, struct sk_buff **array, int n) { return ptr_ring_consume_batched(&a->ring, (void **)array, n); } static inline struct sk_buff *skb_array_consume_irq(struct skb_array *a) { return ptr_ring_consume_irq(&a->ring); } static inline int skb_array_consume_batched_irq(struct skb_array *a, struct sk_buff **array, int n) { return ptr_ring_consume_batched_irq(&a->ring, (void **)array, n); } static inline struct sk_buff *skb_array_consume_any(struct skb_array *a) { return ptr_ring_consume_any(&a->ring); } static inline int skb_array_consume_batched_any(struct skb_array *a, struct sk_buff **array, int n) { return ptr_ring_consume_batched_any(&a->ring, (void **)array, n); } static inline struct sk_buff *skb_array_consume_bh(struct skb_array *a) { return ptr_ring_consume_bh(&a->ring); } static inline int skb_array_consume_batched_bh(struct skb_array *a, struct sk_buff **array, int n) { return ptr_ring_consume_batched_bh(&a->ring, (void **)array, n); } static inline int __skb_array_len_with_tag(struct sk_buff *skb) { if (likely(skb)) { int len = skb->len; if (skb_vlan_tag_present(skb)) len += VLAN_HLEN; return len; } else { return 0; } } static inline int skb_array_peek_len(struct skb_array *a) { return PTR_RING_PEEK_CALL(&a->ring, __skb_array_len_with_tag); } static inline int skb_array_peek_len_irq(struct skb_array *a) { return PTR_RING_PEEK_CALL_IRQ(&a->ring, __skb_array_len_with_tag); } static inline int skb_array_peek_len_bh(struct skb_array *a) { return PTR_RING_PEEK_CALL_BH(&a->ring, __skb_array_len_with_tag); } static inline int skb_array_peek_len_any(struct skb_array *a) { return PTR_RING_PEEK_CALL_ANY(&a->ring, __skb_array_len_with_tag); } static inline int skb_array_init(struct skb_array *a, int size, gfp_t gfp) { return ptr_ring_init(&a->ring, size, gfp); } static void __skb_array_destroy_skb(void *ptr) { kfree_skb(ptr); } static inline void skb_array_unconsume(struct skb_array *a, struct sk_buff **skbs, int n) { ptr_ring_unconsume(&a->ring, (void **)skbs, n, __skb_array_destroy_skb); } static inline int skb_array_resize(struct skb_array *a, int size, gfp_t gfp) { return ptr_ring_resize(&a->ring, size, gfp, __skb_array_destroy_skb); } static inline int skb_array_resize_multiple(struct skb_array **rings, int nrings, unsigned int size, gfp_t gfp) { BUILD_BUG_ON(offsetof(struct skb_array, ring)); return ptr_ring_resize_multiple((struct ptr_ring **)rings, nrings, size, gfp, __skb_array_destroy_skb); } static inline void skb_array_cleanup(struct skb_array *a) { ptr_ring_cleanup(&a->ring, __skb_array_destroy_skb); } #endif /* _LINUX_SKB_ARRAY_H */
2 2 1 10 3 1 1 5 2 3 1 1 1 1 6 1 1 2 1 1 2 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 // SPDX-License-Identifier: GPL-2.0 /* XSKMAP used for AF_XDP sockets * Copyright(c) 2018 Intel Corporation. */ #include <linux/bpf.h> #include <linux/filter.h> #include <net/xdp_sock.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/btf_ids.h> #include "xsk.h" static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map, struct xdp_sock __rcu **map_entry) { struct xsk_map_node *node; node = bpf_map_kzalloc(&map->map, sizeof(*node), GFP_ATOMIC | __GFP_NOWARN); if (!node) return ERR_PTR(-ENOMEM); bpf_map_inc(&map->map); atomic_inc(&map->count); node->map = map; node->map_entry = map_entry; return node; } static void xsk_map_node_free(struct xsk_map_node *node) { struct xsk_map *map = node->map; bpf_map_put(&node->map->map); kfree(node); atomic_dec(&map->count); } static void xsk_map_sock_add(struct xdp_sock *xs, struct xsk_map_node *node) { spin_lock_bh(&xs->map_list_lock); list_add_tail(&node->node, &xs->map_list); spin_unlock_bh(&xs->map_list_lock); } static void xsk_map_sock_delete(struct xdp_sock *xs, struct xdp_sock __rcu **map_entry) { struct xsk_map_node *n, *tmp; spin_lock_bh(&xs->map_list_lock); list_for_each_entry_safe(n, tmp, &xs->map_list, node) { if (map_entry == n->map_entry) { list_del(&n->node); xsk_map_node_free(n); } } spin_unlock_bh(&xs->map_list_lock); } static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) { struct xsk_map *m; int numa_node; u64 size; if (attr->max_entries == 0 || attr->key_size != 4 || attr->value_size != 4 || attr->map_flags & ~(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)) return ERR_PTR(-EINVAL); numa_node = bpf_map_attr_numa_node(attr); size = struct_size(m, xsk_map, attr->max_entries); m = bpf_map_area_alloc(size, numa_node); if (!m) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&m->map, attr); spin_lock_init(&m->lock); return &m->map; } static u64 xsk_map_mem_usage(const struct bpf_map *map) { struct xsk_map *m = container_of(map, struct xsk_map, map); return struct_size(m, xsk_map, map->max_entries) + (u64)atomic_read(&m->count) * sizeof(struct xsk_map_node); } static void xsk_map_free(struct bpf_map *map) { struct xsk_map *m = container_of(map, struct xsk_map, map); synchronize_net(); bpf_map_area_free(m); } static int xsk_map_get_next_key(struct bpf_map *map, void *key, void *next_key) { struct xsk_map *m = container_of(map, struct xsk_map, map); u32 index = key ? *(u32 *)key : U32_MAX; u32 *next = next_key; if (index >= m->map.max_entries) { *next = 0; return 0; } if (index == m->map.max_entries - 1) return -ENOENT; *next = index + 1; return 0; } static int xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { const int ret = BPF_REG_0, mp = BPF_REG_1, index = BPF_REG_2; struct bpf_insn *insn = insn_buf; *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5); *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(sizeof(struct xsk_sock *))); *insn++ = BPF_ALU64_IMM(BPF_ADD, mp, offsetof(struct xsk_map, xsk_map)); *insn++ = BPF_ALU64_REG(BPF_ADD, ret, mp); *insn++ = BPF_LDX_MEM(BPF_SIZEOF(struct xsk_sock *), ret, ret, 0); *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); *insn++ = BPF_MOV64_IMM(ret, 0); return insn - insn_buf; } /* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or * by local_bh_disable() (from XDP calls inside NAPI). The * rcu_read_lock_bh_held() below makes lockdep accept both. */ static void *__xsk_map_lookup_elem(struct bpf_map *map, u32 key) { struct xsk_map *m = container_of(map, struct xsk_map, map); if (key >= map->max_entries) return NULL; return rcu_dereference_check(m->xsk_map[key], rcu_read_lock_bh_held()); } static void *xsk_map_lookup_elem(struct bpf_map *map, void *key) { return __xsk_map_lookup_elem(map, *(u32 *)key); } static void *xsk_map_lookup_elem_sys_only(struct bpf_map *map, void *key) { return ERR_PTR(-EOPNOTSUPP); } static long xsk_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct xsk_map *m = container_of(map, struct xsk_map, map); struct xdp_sock __rcu **map_entry; struct xdp_sock *xs, *old_xs; u32 i = *(u32 *)key, fd = *(u32 *)value; struct xsk_map_node *node; struct socket *sock; int err; if (unlikely(map_flags > BPF_EXIST)) return -EINVAL; if (unlikely(i >= m->map.max_entries)) return -E2BIG; sock = sockfd_lookup(fd, &err); if (!sock) return err; if (sock->sk->sk_family != PF_XDP) { sockfd_put(sock); return -EOPNOTSUPP; } xs = (struct xdp_sock *)sock->sk; map_entry = &m->xsk_map[i]; node = xsk_map_node_alloc(m, map_entry); if (IS_ERR(node)) { sockfd_put(sock); return PTR_ERR(node); } spin_lock_bh(&m->lock); old_xs = rcu_dereference_protected(*map_entry, lockdep_is_held(&m->lock)); if (old_xs == xs) { err = 0; goto out; } else if (old_xs && map_flags == BPF_NOEXIST) { err = -EEXIST; goto out; } else if (!old_xs && map_flags == BPF_EXIST) { err = -ENOENT; goto out; } xsk_map_sock_add(xs, node); rcu_assign_pointer(*map_entry, xs); if (old_xs) xsk_map_sock_delete(old_xs, map_entry); spin_unlock_bh(&m->lock); sockfd_put(sock); return 0; out: spin_unlock_bh(&m->lock); sockfd_put(sock); xsk_map_node_free(node); return err; } static long xsk_map_delete_elem(struct bpf_map *map, void *key) { struct xsk_map *m = container_of(map, struct xsk_map, map); struct xdp_sock __rcu **map_entry; struct xdp_sock *old_xs; int k = *(u32 *)key; if (k >= map->max_entries) return -EINVAL; spin_lock_bh(&m->lock); map_entry = &m->xsk_map[k]; old_xs = unrcu_pointer(xchg(map_entry, NULL)); if (old_xs) xsk_map_sock_delete(old_xs, map_entry); spin_unlock_bh(&m->lock); return 0; } static long xsk_map_redirect(struct bpf_map *map, u64 index, u64 flags) { return __bpf_xdp_redirect_map(map, index, flags, 0, __xsk_map_lookup_elem); } void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, struct xdp_sock __rcu **map_entry) { spin_lock_bh(&map->lock); if (rcu_access_pointer(*map_entry) == xs) { rcu_assign_pointer(*map_entry, NULL); xsk_map_sock_delete(xs, map_entry); } spin_unlock_bh(&map->lock); } static bool xsk_map_meta_equal(const struct bpf_map *meta0, const struct bpf_map *meta1) { return meta0->max_entries == meta1->max_entries && bpf_map_meta_equal(meta0, meta1); } BTF_ID_LIST_SINGLE(xsk_map_btf_ids, struct, xsk_map) const struct bpf_map_ops xsk_map_ops = { .map_meta_equal = xsk_map_meta_equal, .map_alloc = xsk_map_alloc, .map_free = xsk_map_free, .map_get_next_key = xsk_map_get_next_key, .map_lookup_elem = xsk_map_lookup_elem, .map_gen_lookup = xsk_map_gen_lookup, .map_lookup_elem_sys_only = xsk_map_lookup_elem_sys_only, .map_update_elem = xsk_map_update_elem, .map_delete_elem = xsk_map_delete_elem, .map_check_btf = map_check_no_btf, .map_mem_usage = xsk_map_mem_usage, .map_btf_id = &xsk_map_btf_ids[0], .map_redirect = xsk_map_redirect, };
3097 3095 3016 138 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BLK_CGROUP_PRIVATE_H #define _BLK_CGROUP_PRIVATE_H /* * block cgroup private header * * Based on ideas and code from CFQ, CFS and BFQ: * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> * * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it> * Paolo Valente <paolo.valente@unimore.it> * * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com> * Nauman Rafique <nauman@google.com> */ #include <linux/blk-cgroup.h> #include <linux/cgroup.h> #include <linux/kthread.h> #include <linux/blk-mq.h> #include <linux/llist.h> struct blkcg_gq; struct blkg_policy_data; /* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */ #define BLKG_STAT_CPU_BATCH (INT_MAX / 2) #ifdef CONFIG_BLK_CGROUP enum blkg_iostat_type { BLKG_IOSTAT_READ, BLKG_IOSTAT_WRITE, BLKG_IOSTAT_DISCARD, BLKG_IOSTAT_NR, }; struct blkg_iostat { u64 bytes[BLKG_IOSTAT_NR]; u64 ios[BLKG_IOSTAT_NR]; }; struct blkg_iostat_set { struct u64_stats_sync sync; struct blkcg_gq *blkg; struct llist_node lnode; int lqueued; /* queued in llist */ struct blkg_iostat cur; struct blkg_iostat last; }; /* association between a blk cgroup and a request queue */ struct blkcg_gq { /* Pointer to the associated request_queue */ struct request_queue *q; struct list_head q_node; struct hlist_node blkcg_node; struct blkcg *blkcg; /* all non-root blkcg_gq's are guaranteed to have access to parent */ struct blkcg_gq *parent; /* reference count */ struct percpu_ref refcnt; /* is this blkg online? protected by both blkcg and q locks */ bool online; struct blkg_iostat_set __percpu *iostat_cpu; struct blkg_iostat_set iostat; struct blkg_policy_data *pd[BLKCG_MAX_POLS]; #ifdef CONFIG_BLK_CGROUP_PUNT_BIO spinlock_t async_bio_lock; struct bio_list async_bios; #endif union { struct work_struct async_bio_work; struct work_struct free_work; }; atomic_t use_delay; atomic64_t delay_nsec; atomic64_t delay_start; u64 last_delay; int last_use; struct rcu_head rcu_head; }; struct blkcg { struct cgroup_subsys_state css; spinlock_t lock; refcount_t online_pin; struct radix_tree_root blkg_tree; struct blkcg_gq __rcu *blkg_hint; struct hlist_head blkg_list; struct blkcg_policy_data *cpd[BLKCG_MAX_POLS]; struct list_head all_blkcgs_node; /* * List of updated percpu blkg_iostat_set's since the last flush. */ struct llist_head __percpu *lhead; #ifdef CONFIG_BLK_CGROUP_FC_APPID char fc_app_id[FC_APPID_LEN]; #endif #ifdef CONFIG_CGROUP_WRITEBACK struct list_head cgwb_list; #endif }; static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) { return css ? container_of(css, struct blkcg, css) : NULL; } /* * A blkcg_gq (blkg) is association between a block cgroup (blkcg) and a * request_queue (q). This is used by blkcg policies which need to track * information per blkcg - q pair. * * There can be multiple active blkcg policies and each blkg:policy pair is * represented by a blkg_policy_data which is allocated and freed by each * policy's pd_alloc/free_fn() methods. A policy can allocate private data * area by allocating larger data structure which embeds blkg_policy_data * at the beginning. */ struct blkg_policy_data { /* the blkg and policy id this per-policy data belongs to */ struct blkcg_gq *blkg; int plid; bool online; }; /* * Policies that need to keep per-blkcg data which is independent from any * request_queue associated to it should implement cpd_alloc/free_fn() * methods. A policy can allocate private data area by allocating larger * data structure which embeds blkcg_policy_data at the beginning. * cpd_init() is invoked to let each policy handle per-blkcg data. */ struct blkcg_policy_data { /* the blkcg and policy id this per-policy data belongs to */ struct blkcg *blkcg; int plid; }; typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp); typedef void (blkcg_pol_init_cpd_fn)(struct blkcg_policy_data *cpd); typedef void (blkcg_pol_free_cpd_fn)(struct blkcg_policy_data *cpd); typedef void (blkcg_pol_bind_cpd_fn)(struct blkcg_policy_data *cpd); typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(struct gendisk *disk, struct blkcg *blkcg, gfp_t gfp); typedef void (blkcg_pol_init_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_stat_pd_fn)(struct blkg_policy_data *pd, struct seq_file *s); struct blkcg_policy { int plid; /* cgroup files for the policy */ struct cftype *dfl_cftypes; struct cftype *legacy_cftypes; /* operations */ blkcg_pol_alloc_cpd_fn *cpd_alloc_fn; blkcg_pol_free_cpd_fn *cpd_free_fn; blkcg_pol_alloc_pd_fn *pd_alloc_fn; blkcg_pol_init_pd_fn *pd_init_fn; blkcg_pol_online_pd_fn *pd_online_fn; blkcg_pol_offline_pd_fn *pd_offline_fn; blkcg_pol_free_pd_fn *pd_free_fn; blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn; blkcg_pol_stat_pd_fn *pd_stat_fn; }; extern struct blkcg blkcg_root; extern bool blkcg_debug_stats; int blkcg_init_disk(struct gendisk *disk); void blkcg_exit_disk(struct gendisk *disk); /* Blkio controller policy registration */ int blkcg_policy_register(struct blkcg_policy *pol); void blkcg_policy_unregister(struct blkcg_policy *pol); int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol); void blkcg_deactivate_policy(struct gendisk *disk, const struct blkcg_policy *pol); const char *blkg_dev_name(struct blkcg_gq *blkg); void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, u64 (*prfill)(struct seq_file *, struct blkg_policy_data *, int), const struct blkcg_policy *pol, int data, bool show_total); u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v); struct blkg_conf_ctx { char *input; char *body; struct block_device *bdev; struct blkcg_gq *blkg; }; void blkg_conf_init(struct blkg_conf_ctx *ctx, char *input); int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx); int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, struct blkg_conf_ctx *ctx); void blkg_conf_exit(struct blkg_conf_ctx *ctx); /** * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg * @return: true if this bio needs to be submitted with the root blkg context. * * In order to avoid priority inversions we sometimes need to issue a bio as if * it were attached to the root blkg, and then backcharge to the actual owning * blkg. The idea is we do bio_blkcg_css() to look up the actual context for * the bio and attach the appropriate blkg to the bio. Then we call this helper * and if it is true run with the root blkg for that queue and then do any * backcharging to the originating cgroup once the io is complete. */ static inline bool bio_issue_as_root_blkg(struct bio *bio) { return (bio->bi_opf & (REQ_META | REQ_SWAP)) != 0; } /** * blkg_lookup - lookup blkg for the specified blkcg - q pair * @blkcg: blkcg of interest * @q: request_queue of interest * * Lookup blkg for the @blkcg - @q pair. * Must be called in a RCU critical section. */ static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q) { struct blkcg_gq *blkg; if (blkcg == &blkcg_root) return q->root_blkg; blkg = rcu_dereference_check(blkcg->blkg_hint, lockdep_is_held(&q->queue_lock)); if (blkg && blkg->q == q) return blkg; blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id); if (blkg && blkg->q != q) blkg = NULL; return blkg; } /** * blkg_to_pdata - get policy private data * @blkg: blkg of interest * @pol: policy of interest * * Return pointer to private data associated with the @blkg-@pol pair. */ static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, struct blkcg_policy *pol) { return blkg ? blkg->pd[pol->plid] : NULL; } static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg, struct blkcg_policy *pol) { return blkcg ? blkcg->cpd[pol->plid] : NULL; } /** * pdata_to_blkg - get blkg associated with policy private data * @pd: policy private data of interest * * @pd is policy private data. Determine the blkg it's associated with. */ static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return pd ? pd->blkg : NULL; } static inline struct blkcg *cpd_to_blkcg(struct blkcg_policy_data *cpd) { return cpd ? cpd->blkcg : NULL; } /** * blkg_path - format cgroup path of blkg * @blkg: blkg of interest * @buf: target buffer * @buflen: target buffer length * * Format the path of the cgroup of @blkg into @buf. */ static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen) { return cgroup_path(blkg->blkcg->css.cgroup, buf, buflen); } /** * blkg_get - get a blkg reference * @blkg: blkg to get * * The caller should be holding an existing reference. */ static inline void blkg_get(struct blkcg_gq *blkg) { percpu_ref_get(&blkg->refcnt); } /** * blkg_tryget - try and get a blkg reference * @blkg: blkg to get * * This is for use when doing an RCU lookup of the blkg. We may be in the midst * of freeing this blkg, so we can only use it if the refcnt is not zero. */ static inline bool blkg_tryget(struct blkcg_gq *blkg) { return blkg && percpu_ref_tryget(&blkg->refcnt); } /** * blkg_put - put a blkg reference * @blkg: blkg to put */ static inline void blkg_put(struct blkcg_gq *blkg) { percpu_ref_put(&blkg->refcnt); } /** * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants * @d_blkg: loop cursor pointing to the current descendant * @pos_css: used for iteration * @p_blkg: target blkg to walk descendants of * * Walk @c_blkg through the descendants of @p_blkg. Must be used with RCU * read locked. If called under either blkcg or queue lock, the iteration * is guaranteed to include all and only online blkgs. The caller may * update @pos_css by calling css_rightmost_descendant() to skip subtree. * @p_blkg is included in the iteration and the first node to be visited. */ #define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg) \ css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css) \ if (((d_blkg) = blkg_lookup(css_to_blkcg(pos_css), \ (p_blkg)->q))) /** * blkg_for_each_descendant_post - post-order walk of a blkg's descendants * @d_blkg: loop cursor pointing to the current descendant * @pos_css: used for iteration * @p_blkg: target blkg to walk descendants of * * Similar to blkg_for_each_descendant_pre() but performs post-order * traversal instead. Synchronization rules are the same. @p_blkg is * included in the iteration and the last node to be visited. */ #define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg) \ css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css) \ if (((d_blkg) = blkg_lookup(css_to_blkcg(pos_css), \ (p_blkg)->q))) static inline void blkcg_bio_issue_init(struct bio *bio) { bio_issue_init(&bio->bi_issue, bio_sectors(bio)); } static inline void blkcg_use_delay(struct blkcg_gq *blkg) { if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0)) return; if (atomic_add_return(1, &blkg->use_delay) == 1) atomic_inc(&blkg->blkcg->css.cgroup->congestion_count); } static inline int blkcg_unuse_delay(struct blkcg_gq *blkg) { int old = atomic_read(&blkg->use_delay); if (WARN_ON_ONCE(old < 0)) return 0; if (old == 0) return 0; /* * We do this song and dance because we can race with somebody else * adding or removing delay. If we just did an atomic_dec we'd end up * negative and we'd already be in trouble. We need to subtract 1 and * then check to see if we were the last delay so we can drop the * congestion count on the cgroup. */ while (old && !atomic_try_cmpxchg(&blkg->use_delay, &old, old - 1)) ; if (old == 0) return 0; if (old == 1) atomic_dec(&blkg->blkcg->css.cgroup->congestion_count); return 1; } /** * blkcg_set_delay - Enable allocator delay mechanism with the specified delay amount * @blkg: target blkg * @delay: delay duration in nsecs * * When enabled with this function, the delay is not decayed and must be * explicitly cleared with blkcg_clear_delay(). Must not be mixed with * blkcg_[un]use_delay() and blkcg_add_delay() usages. */ static inline void blkcg_set_delay(struct blkcg_gq *blkg, u64 delay) { int old = atomic_read(&blkg->use_delay); /* We only want 1 person setting the congestion count for this blkg. */ if (!old && atomic_try_cmpxchg(&blkg->use_delay, &old, -1)) atomic_inc(&blkg->blkcg->css.cgroup->congestion_count); atomic64_set(&blkg->delay_nsec, delay); } /** * blkcg_clear_delay - Disable allocator delay mechanism * @blkg: target blkg * * Disable use_delay mechanism. See blkcg_set_delay(). */ static inline void blkcg_clear_delay(struct blkcg_gq *blkg) { int old = atomic_read(&blkg->use_delay); /* We only want 1 person clearing the congestion count for this blkg. */ if (old && atomic_try_cmpxchg(&blkg->use_delay, &old, 0)) atomic_dec(&blkg->blkcg->css.cgroup->congestion_count); } /** * blk_cgroup_mergeable - Determine whether to allow or disallow merges * @rq: request to merge into * @bio: bio to merge * * @bio and @rq should belong to the same cgroup and their issue_as_root should * match. The latter is necessary as we don't want to throttle e.g. a metadata * update because it happens to be next to a regular IO. */ static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio) { return rq->bio->bi_blkg == bio->bi_blkg && bio_issue_as_root_blkg(rq->bio) == bio_issue_as_root_blkg(bio); } void blk_cgroup_bio_start(struct bio *bio); void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta); #else /* CONFIG_BLK_CGROUP */ struct blkg_policy_data { }; struct blkcg_policy_data { }; struct blkcg_policy { }; struct blkcg { }; static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } static inline int blkcg_init_disk(struct gendisk *disk) { return 0; } static inline void blkcg_exit_disk(struct gendisk *disk) { } static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; } static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { } static inline int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol) { return 0; } static inline void blkcg_deactivate_policy(struct gendisk *disk, const struct blkcg_policy *pol) { } static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, struct blkcg_policy *pol) { return NULL; } static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; } static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; } static inline void blkg_get(struct blkcg_gq *blkg) { } static inline void blkg_put(struct blkcg_gq *blkg) { } static inline void blkcg_bio_issue_init(struct bio *bio) { } static inline void blk_cgroup_bio_start(struct bio *bio) { } static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio) { return true; } #define blk_queue_for_each_rl(rl, q) \ for ((rl) = &(q)->root_rl; (rl); (rl) = NULL) #endif /* CONFIG_BLK_CGROUP */ #endif /* _BLK_CGROUP_PRIVATE_H */
508 507 477 478 479 478 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 // SPDX-License-Identifier: GPL-2.0 /* * of.c The helpers for hcd device tree support * * Copyright (C) 2016 Freescale Semiconductor, Inc. * Author: Peter Chen <peter.chen@freescale.com> * Copyright (C) 2017 Johan Hovold <johan@kernel.org> */ #include <linux/of.h> #include <linux/usb/of.h> /** * usb_of_get_device_node() - get a USB device node * @hub: hub to which device is connected * @port1: one-based index of port * * Look up the node of a USB device given its parent hub device and one-based * port number. * * Return: A pointer to the node with incremented refcount if found, or * %NULL otherwise. */ struct device_node *usb_of_get_device_node(struct usb_device *hub, int port1) { struct device_node *node; u32 reg; for_each_child_of_node(hub->dev.of_node, node) { if (of_property_read_u32(node, "reg", &reg)) continue; if (reg == port1) return node; } return NULL; } EXPORT_SYMBOL_GPL(usb_of_get_device_node); /** * usb_of_has_combined_node() - determine whether a device has a combined node * @udev: USB device * * Determine whether a USB device has a so called combined node which is * shared with its sole interface. This is the case if and only if the device * has a node and its descriptors report the following: * * 1) bDeviceClass is 0 or 9, and * 2) bNumConfigurations is 1, and * 3) bNumInterfaces is 1. * * Return: True iff the device has a device node and its descriptors match the * criteria for a combined node. */ bool usb_of_has_combined_node(struct usb_device *udev) { struct usb_device_descriptor *ddesc = &udev->descriptor; struct usb_config_descriptor *cdesc; if (!udev->dev.of_node) return false; switch (ddesc->bDeviceClass) { case USB_CLASS_PER_INTERFACE: case USB_CLASS_HUB: if (ddesc->bNumConfigurations == 1) { cdesc = &udev->config->desc; if (cdesc->bNumInterfaces == 1) return true; } } return false; } EXPORT_SYMBOL_GPL(usb_of_has_combined_node); /** * usb_of_get_interface_node() - get a USB interface node * @udev: USB device of interface * @config: configuration value * @ifnum: interface number * * Look up the node of a USB interface given its USB device, configuration * value and interface number. * * Return: A pointer to the node with incremented refcount if found, or * %NULL otherwise. */ struct device_node * usb_of_get_interface_node(struct usb_device *udev, u8 config, u8 ifnum) { struct device_node *node; u32 reg[2]; for_each_child_of_node(udev->dev.of_node, node) { if (of_property_read_u32_array(node, "reg", reg, 2)) continue; if (reg[0] == ifnum && reg[1] == config) return node; } return NULL; } EXPORT_SYMBOL_GPL(usb_of_get_interface_node);
4 27 5 11 12 3 6 24 11 31 17 20 3 3 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2006 Jiri Benc <jbenc@suse.cz> * Copyright 2007 Johannes Berg <johannes@sipsolutions.net> * Copyright (C) 2020-2023 Intel Corporation */ #include <linux/kernel.h> #include <linux/device.h> #include <linux/if.h> #include <linux/if_ether.h> #include <linux/interrupt.h> #include <linux/netdevice.h> #include <linux/rtnetlink.h> #include <linux/slab.h> #include <linux/notifier.h> #include <net/mac80211.h> #include <net/cfg80211.h> #include "ieee80211_i.h" #include "rate.h" #include "debugfs.h" #include "debugfs_netdev.h" #include "driver-ops.h" struct ieee80211_if_read_sdata_data { ssize_t (*format)(const struct ieee80211_sub_if_data *, char *, int); struct ieee80211_sub_if_data *sdata; }; static ssize_t ieee80211_if_read_sdata_handler(struct wiphy *wiphy, struct file *file, char *buf, size_t bufsize, void *data) { struct ieee80211_if_read_sdata_data *d = data; return d->format(d->sdata, buf, bufsize); } static ssize_t ieee80211_if_read_sdata( struct file *file, char __user *userbuf, size_t count, loff_t *ppos, ssize_t (*format)(const struct ieee80211_sub_if_data *sdata, char *, int)) { struct ieee80211_sub_if_data *sdata = file->private_data; struct ieee80211_if_read_sdata_data data = { .format = format, .sdata = sdata, }; char buf[200]; return wiphy_locked_debugfs_read(sdata->local->hw.wiphy, file, buf, sizeof(buf), userbuf, count, ppos, ieee80211_if_read_sdata_handler, &data); } struct ieee80211_if_write_sdata_data { ssize_t (*write)(struct ieee80211_sub_if_data *, const char *, int); struct ieee80211_sub_if_data *sdata; }; static ssize_t ieee80211_if_write_sdata_handler(struct wiphy *wiphy, struct file *file, char *buf, size_t count, void *data) { struct ieee80211_if_write_sdata_data *d = data; return d->write(d->sdata, buf, count); } static ssize_t ieee80211_if_write_sdata( struct file *file, const char __user *userbuf, size_t count, loff_t *ppos, ssize_t (*write)(struct ieee80211_sub_if_data *sdata, const char *, int)) { struct ieee80211_sub_if_data *sdata = file->private_data; struct ieee80211_if_write_sdata_data data = { .write = write, .sdata = sdata, }; char buf[64]; return wiphy_locked_debugfs_write(sdata->local->hw.wiphy, file, buf, sizeof(buf), userbuf, count, ieee80211_if_write_sdata_handler, &data); } struct ieee80211_if_read_link_data { ssize_t (*format)(const struct ieee80211_link_data *, char *, int); struct ieee80211_link_data *link; }; static ssize_t ieee80211_if_read_link_handler(struct wiphy *wiphy, struct file *file, char *buf, size_t bufsize, void *data) { struct ieee80211_if_read_link_data *d = data; return d->format(d->link, buf, bufsize); } static ssize_t ieee80211_if_read_link( struct file *file, char __user *userbuf, size_t count, loff_t *ppos, ssize_t (*format)(const struct ieee80211_link_data *link, char *, int)) { struct ieee80211_link_data *link = file->private_data; struct ieee80211_if_read_link_data data = { .format = format, .link = link, }; char buf[200]; return wiphy_locked_debugfs_read(link->sdata->local->hw.wiphy, file, buf, sizeof(buf), userbuf, count, ppos, ieee80211_if_read_link_handler, &data); } struct ieee80211_if_write_link_data { ssize_t (*write)(struct ieee80211_link_data *, const char *, int); struct ieee80211_link_data *link; }; static ssize_t ieee80211_if_write_link_handler(struct wiphy *wiphy, struct file *file, char *buf, size_t count, void *data) { struct ieee80211_if_write_sdata_data *d = data; return d->write(d->sdata, buf, count); } static ssize_t ieee80211_if_write_link( struct file *file, const char __user *userbuf, size_t count, loff_t *ppos, ssize_t (*write)(struct ieee80211_link_data *link, const char *, int)) { struct ieee80211_link_data *link = file->private_data; struct ieee80211_if_write_link_data data = { .write = write, .link = link, }; char buf[64]; return wiphy_locked_debugfs_write(link->sdata->local->hw.wiphy, file, buf, sizeof(buf), userbuf, count, ieee80211_if_write_link_handler, &data); } #define IEEE80211_IF_FMT(name, type, field, format_string) \ static ssize_t ieee80211_if_fmt_##name( \ const type *data, char *buf, \ int buflen) \ { \ return scnprintf(buf, buflen, format_string, data->field); \ } #define IEEE80211_IF_FMT_DEC(name, type, field) \ IEEE80211_IF_FMT(name, type, field, "%d\n") #define IEEE80211_IF_FMT_HEX(name, type, field) \ IEEE80211_IF_FMT(name, type, field, "%#x\n") #define IEEE80211_IF_FMT_LHEX(name, type, field) \ IEEE80211_IF_FMT(name, type, field, "%#lx\n") #define IEEE80211_IF_FMT_HEXARRAY(name, type, field) \ static ssize_t ieee80211_if_fmt_##name( \ const type *data, \ char *buf, int buflen) \ { \ char *p = buf; \ int i; \ for (i = 0; i < sizeof(data->field); i++) { \ p += scnprintf(p, buflen + buf - p, "%.2x ", \ data->field[i]); \ } \ p += scnprintf(p, buflen + buf - p, "\n"); \ return p - buf; \ } #define IEEE80211_IF_FMT_ATOMIC(name, type, field) \ static ssize_t ieee80211_if_fmt_##name( \ const type *data, \ char *buf, int buflen) \ { \ return scnprintf(buf, buflen, "%d\n", atomic_read(&data->field));\ } #define IEEE80211_IF_FMT_MAC(name, type, field) \ static ssize_t ieee80211_if_fmt_##name( \ const type *data, char *buf, \ int buflen) \ { \ return scnprintf(buf, buflen, "%pM\n", data->field); \ } #define IEEE80211_IF_FMT_JIFFIES_TO_MS(name, type, field) \ static ssize_t ieee80211_if_fmt_##name( \ const type *data, \ char *buf, int buflen) \ { \ return scnprintf(buf, buflen, "%d\n", \ jiffies_to_msecs(data->field)); \ } #define _IEEE80211_IF_FILE_OPS(name, _read, _write) \ static const struct file_operations name##_ops = { \ .read = (_read), \ .write = (_write), \ .open = simple_open, \ .llseek = generic_file_llseek, \ } #define _IEEE80211_IF_FILE_R_FN(name) \ static ssize_t ieee80211_if_read_##name(struct file *file, \ char __user *userbuf, \ size_t count, loff_t *ppos) \ { \ return ieee80211_if_read_sdata(file, \ userbuf, count, ppos, \ ieee80211_if_fmt_##name); \ } #define _IEEE80211_IF_FILE_W_FN(name) \ static ssize_t ieee80211_if_write_##name(struct file *file, \ const char __user *userbuf, \ size_t count, loff_t *ppos) \ { \ return ieee80211_if_write_s