26 1 2 24 25 4 4 2 2 7 1 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 | /* * Routines to compress and uncompress tcp packets (for transmission * over low speed serial lines). * * Copyright (c) 1989 Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms are permitted * provided that the above copyright notice and this paragraph are * duplicated in all such forms and that any documentation, * advertising materials, and other materials related to such * distribution and use acknowledge that the software was developed * by the University of California, Berkeley. The name of the * University may not be used to endorse or promote products derived * from this software without specific prior written permission. * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. * * Van Jacobson (van@helios.ee.lbl.gov), Dec 31, 1989: * - Initial distribution. * * * modified for KA9Q Internet Software Package by * Katie Stevens (dkstevens@ucdavis.edu) * University of California, Davis * Computing Services * - 01-31-90 initial adaptation (from 1.19) * PPP.05 02-15-90 [ks] * PPP.08 05-02-90 [ks] use PPP protocol field to signal compression * PPP.15 09-90 [ks] improve mbuf handling * PPP.16 11-02 [karn] substantially rewritten to use NOS facilities * * - Feb 1991 Bill_Simpson@um.cc.umich.edu * variable number of conversation slots * allow zero or one slots * separate routines * status display * - Jul 1994 Dmitry Gorodchanin * Fixes for memory leaks. * - Oct 1994 Dmitry Gorodchanin * Modularization. * - Jan 1995 Bjorn Ekwall * Use ip_fast_csum from ip.h * - July 1995 Christos A. Polyzols * Spotted bug in tcp option checking * * * This module is a difficult issue. It's clearly inet code but it's also clearly * driver code belonging close to PPP and SLIP */ #include <linux/module.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/kernel.h> #include <net/slhc_vj.h> #ifdef CONFIG_INET /* Entire module is for IP only */ #include <linux/mm.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/termios.h> #include <linux/in.h> #include <linux/fcntl.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <net/ip.h> #include <net/protocol.h> #include <net/icmp.h> #include <net/tcp.h> #include <linux/skbuff.h> #include <net/sock.h> #include <linux/timer.h> #include <linux/uaccess.h> #include <net/checksum.h> #include <asm/unaligned.h> static unsigned char *encode(unsigned char *cp, unsigned short n); static long decode(unsigned char **cpp); static unsigned char * put16(unsigned char *cp, unsigned short x); static unsigned short pull16(unsigned char **cpp); /* Allocate compression data structure * slots must be in range 0 to 255 (zero meaning no compression) * Returns pointer to structure or ERR_PTR() on error. */ struct slcompress * slhc_init(int rslots, int tslots) { short i; struct cstate *ts; struct slcompress *comp; if (rslots < 0 || rslots > 255 || tslots < 0 || tslots > 255) return ERR_PTR(-EINVAL); comp = kzalloc(sizeof(struct slcompress), GFP_KERNEL); if (! comp) goto out_fail; if (rslots > 0) { size_t rsize = rslots * sizeof(struct cstate); comp->rstate = kzalloc(rsize, GFP_KERNEL); if (! comp->rstate) goto out_free; comp->rslot_limit = rslots - 1; } if (tslots > 0) { size_t tsize = tslots * sizeof(struct cstate); comp->tstate = kzalloc(tsize, GFP_KERNEL); if (! comp->tstate) goto out_free2; comp->tslot_limit = tslots - 1; } comp->xmit_oldest = 0; comp->xmit_current = 255; comp->recv_current = 255; /* * don't accept any packets with implicit index until we get * one with an explicit index. Otherwise the uncompress code * will try to use connection 255, which is almost certainly * out of range */ comp->flags |= SLF_TOSS; if ( tslots > 0 ) { ts = comp->tstate; for(i = comp->tslot_limit; i > 0; --i){ ts[i].cs_this = i; ts[i].next = &(ts[i - 1]); } ts[0].next = &(ts[comp->tslot_limit]); ts[0].cs_this = 0; } return comp; out_free2: kfree(comp->rstate); out_free: kfree(comp); out_fail: return ERR_PTR(-ENOMEM); } /* Free a compression data structure */ void slhc_free(struct slcompress *comp) { if ( IS_ERR_OR_NULL(comp) ) return; if ( comp->tstate != NULLSLSTATE ) kfree( comp->tstate ); if ( comp->rstate != NULLSLSTATE ) kfree( comp->rstate ); kfree( comp ); } /* Put a short in host order into a char array in network order */ static inline unsigned char * put16(unsigned char *cp, unsigned short x) { *cp++ = x >> 8; *cp++ = x; return cp; } /* Encode a number */ static unsigned char * encode(unsigned char *cp, unsigned short n) { if(n >= 256 || n == 0){ *cp++ = 0; cp = put16(cp,n); } else { *cp++ = n; } return cp; } /* Pull a 16-bit integer in host order from buffer in network byte order */ static unsigned short pull16(unsigned char **cpp) { short rval; rval = *(*cpp)++; rval <<= 8; rval |= *(*cpp)++; return rval; } /* Decode a number */ static long decode(unsigned char **cpp) { int x; x = *(*cpp)++; if(x == 0){ return pull16(cpp) & 0xffff; /* pull16 returns -1 on error */ } else { return x & 0xff; /* -1 if PULLCHAR returned error */ } } /* * icp and isize are the original packet. * ocp is a place to put a copy if necessary. * cpp is initially a pointer to icp. If the copy is used, * change it to ocp. */ int slhc_compress(struct slcompress *comp, unsigned char *icp, int isize, unsigned char *ocp, unsigned char **cpp, int compress_cid) { struct cstate *ocs = &(comp->tstate[comp->xmit_oldest]); struct cstate *lcs = ocs; struct cstate *cs = lcs->next; unsigned long deltaS, deltaA; short changes = 0; int nlen, hlen; unsigned char new_seq[16]; unsigned char *cp = new_seq; struct iphdr *ip; struct tcphdr *th, *oth; __sum16 csum; /* * Don't play with runt packets. */ if(isize<sizeof(struct iphdr)) return isize; ip = (struct iphdr *) icp; if (ip->version != 4 || ip->ihl < 5) return isize; /* Bail if this packet isn't TCP, or is an IP fragment */ if (ip->protocol != IPPROTO_TCP || (ntohs(ip->frag_off) & 0x3fff)) { /* Send as regular IP */ if(ip->protocol != IPPROTO_TCP) comp->sls_o_nontcp++; else comp->sls_o_tcp++; return isize; } nlen = ip->ihl * 4; if (isize < nlen + sizeof(*th)) return isize; th = (struct tcphdr *)(icp + nlen); if (th->doff < sizeof(struct tcphdr) / 4) return isize; hlen = nlen + th->doff * 4; /* Bail if the TCP packet isn't `compressible' (i.e., ACK isn't set or * some other control bit is set). Also uncompressible if * it's a runt. */ if(hlen > isize || th->syn || th->fin || th->rst || ! (th->ack)){ /* TCP connection stuff; send as regular IP */ comp->sls_o_tcp++; return isize; } /* * Packet is compressible -- we're going to send either a * COMPRESSED_TCP or UNCOMPRESSED_TCP packet. Either way, * we need to locate (or create) the connection state. * * States are kept in a circularly linked list with * xmit_oldest pointing to the end of the list. The * list is kept in lru order by moving a state to the * head of the list whenever it is referenced. Since * the list is short and, empirically, the connection * we want is almost always near the front, we locate * states via linear search. If we don't find a state * for the datagram, the oldest state is (re-)used. */ for ( ; ; ) { if( ip->saddr == cs->cs_ip.saddr && ip->daddr == cs->cs_ip.daddr && th->source == cs->cs_tcp.source && th->dest == cs->cs_tcp.dest) goto found; /* if current equal oldest, at end of list */ if ( cs == ocs ) break; lcs = cs; cs = cs->next; comp->sls_o_searches++; } /* * Didn't find it -- re-use oldest cstate. Send an * uncompressed packet that tells the other side what * connection number we're using for this conversation. * * Note that since the state list is circular, the oldest * state points to the newest and we only need to set * xmit_oldest to update the lru linkage. */ comp->sls_o_misses++; comp->xmit_oldest = lcs->cs_this; goto uncompressed; found: /* * Found it -- move to the front on the connection list. */ if(lcs == ocs) { /* found at most recently used */ } else if (cs == ocs) { /* found at least recently used */ comp->xmit_oldest = lcs->cs_this; } else { /* more than 2 elements */ lcs->next = cs->next; cs->next = ocs->next; ocs->next = cs; } /* * Make sure that only what we expect to change changed. * Check the following: * IP protocol version, header length & type of service. * The "Don't fragment" bit. * The time-to-live field. * The TCP header length. * IP options, if any. * TCP options, if any. * If any of these things are different between the previous & * current datagram, we send the current datagram `uncompressed'. */ oth = &cs->cs_tcp; if(ip->version != cs->cs_ip.version || ip->ihl != cs->cs_ip.ihl || ip->tos != cs->cs_ip.tos || (ip->frag_off & htons(0x4000)) != (cs->cs_ip.frag_off & htons(0x4000)) || ip->ttl != cs->cs_ip.ttl || th->doff != cs->cs_tcp.doff || (ip->ihl > 5 && memcmp(ip+1,cs->cs_ipopt,((ip->ihl)-5)*4) != 0) || (th->doff > 5 && memcmp(th+1,cs->cs_tcpopt,((th->doff)-5)*4) != 0)){ goto uncompressed; } /* * Figure out which of the changing fields changed. The * receiver expects changes in the order: urgent, window, * ack, seq (the order minimizes the number of temporaries * needed in this section of code). */ if(th->urg){ deltaS = ntohs(th->urg_ptr); cp = encode(cp,deltaS); changes |= NEW_U; } else if(th->urg_ptr != oth->urg_ptr){ /* argh! URG not set but urp changed -- a sensible * implementation should never do this but RFC793 * doesn't prohibit the change so we have to deal * with it. */ goto uncompressed; } if((deltaS = ntohs(th->window) - ntohs(oth->window)) != 0){ cp = encode(cp,deltaS); changes |= NEW_W; } if((deltaA = ntohl(th->ack_seq) - ntohl(oth->ack_seq)) != 0L){ if(deltaA > 0x0000ffff) goto uncompressed; cp = encode(cp,deltaA); changes |= NEW_A; } if((deltaS = ntohl(th->seq) - ntohl(oth->seq)) != 0L){ if(deltaS > 0x0000ffff) goto uncompressed; cp = encode(cp,deltaS); changes |= NEW_S; } switch(changes){ case 0: /* Nothing changed. If this packet contains data and the * last one didn't, this is probably a data packet following * an ack (normal on an interactive connection) and we send * it compressed. Otherwise it's probably a retransmit, * retransmitted ack or window probe. Send it uncompressed * in case the other side missed the compressed version. */ if(ip->tot_len != cs->cs_ip.tot_len && ntohs(cs->cs_ip.tot_len) == hlen) break; goto uncompressed; case SPECIAL_I: case SPECIAL_D: /* actual changes match one of our special case encodings -- * send packet uncompressed. */ goto uncompressed; case NEW_S|NEW_A: if(deltaS == deltaA && deltaS == ntohs(cs->cs_ip.tot_len) - hlen){ /* special case for echoed terminal traffic */ changes = SPECIAL_I; cp = new_seq; } break; case NEW_S: if(deltaS == ntohs(cs->cs_ip.tot_len) - hlen){ /* special case for data xfer */ changes = SPECIAL_D; cp = new_seq; } break; } deltaS = ntohs(ip->id) - ntohs(cs->cs_ip.id); if(deltaS != 1){ cp = encode(cp,deltaS); changes |= NEW_I; } if(th->psh) changes |= TCP_PUSH_BIT; /* Grab the cksum before we overwrite it below. Then update our * state with this packet's header. */ csum = th->check; memcpy(&cs->cs_ip,ip,20); memcpy(&cs->cs_tcp,th,20); /* We want to use the original packet as our compressed packet. * (cp - new_seq) is the number of bytes we need for compressed * sequence numbers. In addition we need one byte for the change * mask, one for the connection id and two for the tcp checksum. * So, (cp - new_seq) + 4 bytes of header are needed. */ deltaS = cp - new_seq; if(compress_cid == 0 || comp->xmit_current != cs->cs_this){ cp = ocp; *cpp = ocp; *cp++ = changes | NEW_C; *cp++ = cs->cs_this; comp->xmit_current = cs->cs_this; } else { cp = ocp; *cpp = ocp; *cp++ = changes; } *(__sum16 *)cp = csum; cp += 2; /* deltaS is now the size of the change section of the compressed header */ memcpy(cp,new_seq,deltaS); /* Write list of deltas */ memcpy(cp+deltaS,icp+hlen,isize-hlen); comp->sls_o_compressed++; ocp[0] |= SL_TYPE_COMPRESSED_TCP; return isize - hlen + deltaS + (cp - ocp); /* Update connection state cs & send uncompressed packet (i.e., * a regular ip/tcp packet but with the 'conversation id' we hope * to use on future compressed packets in the protocol field). */ uncompressed: memcpy(&cs->cs_ip,ip,20); memcpy(&cs->cs_tcp,th,20); if (ip->ihl > 5) memcpy(cs->cs_ipopt, ip+1, ((ip->ihl) - 5) * 4); if (th->doff > 5) memcpy(cs->cs_tcpopt, th+1, ((th->doff) - 5) * 4); comp->xmit_current = cs->cs_this; comp->sls_o_uncompressed++; memcpy(ocp, icp, isize); *cpp = ocp; ocp[9] = cs->cs_this; ocp[0] |= SL_TYPE_UNCOMPRESSED_TCP; return isize; } int slhc_uncompress(struct slcompress *comp, unsigned char *icp, int isize) { int changes; long x; struct tcphdr *thp; struct iphdr *ip; struct cstate *cs; int len, hdrlen; unsigned char *cp = icp; /* We've got a compressed packet; read the change byte */ comp->sls_i_compressed++; if(isize < 3){ comp->sls_i_error++; return 0; } changes = *cp++; if(changes & NEW_C){ /* Make sure the state index is in range, then grab the state. * If we have a good state index, clear the 'discard' flag. */ x = *cp++; /* Read conn index */ if(x < 0 || x > comp->rslot_limit) goto bad; /* Check if the cstate is initialized */ if (!comp->rstate[x].initialized) goto bad; comp->flags &=~ SLF_TOSS; comp->recv_current = x; } else { /* this packet has an implicit state index. If we've * had a line error since the last time we got an * explicit state index, we have to toss the packet. */ if(comp->flags & SLF_TOSS){ comp->sls_i_tossed++; return 0; } } cs = &comp->rstate[comp->recv_current]; thp = &cs->cs_tcp; ip = &cs->cs_ip; thp->check = *(__sum16 *)cp; cp += 2; thp->psh = (changes & TCP_PUSH_BIT) ? 1 : 0; /* * we can use the same number for the length of the saved header and * the current one, because the packet wouldn't have been sent * as compressed unless the options were the same as the previous one */ hdrlen = ip->ihl * 4 + thp->doff * 4; switch(changes & SPECIALS_MASK){ case SPECIAL_I: /* Echoed terminal traffic */ { short i; i = ntohs(ip->tot_len) - hdrlen; thp->ack_seq = htonl( ntohl(thp->ack_seq) + i); thp->seq = htonl( ntohl(thp->seq) + i); } break; case SPECIAL_D: /* Unidirectional data */ thp->seq = htonl( ntohl(thp->seq) + ntohs(ip->tot_len) - hdrlen); break; default: if(changes & NEW_U){ thp->urg = 1; if((x = decode(&cp)) == -1) { goto bad; } thp->urg_ptr = htons(x); } else thp->urg = 0; if(changes & NEW_W){ if((x = decode(&cp)) == -1) { goto bad; } thp->window = htons( ntohs(thp->window) + x); } if(changes & NEW_A){ if((x = decode(&cp)) == -1) { goto bad; } thp->ack_seq = htonl( ntohl(thp->ack_seq) + x); } if(changes & NEW_S){ if((x = decode(&cp)) == -1) { goto bad; } thp->seq = htonl( ntohl(thp->seq) + x); } break; } if(changes & NEW_I){ if((x = decode(&cp)) == -1) { goto bad; } ip->id = htons (ntohs (ip->id) + x); } else ip->id = htons (ntohs (ip->id) + 1); /* * At this point, cp points to the first byte of data in the * packet. Put the reconstructed TCP and IP headers back on the * packet. Recalculate IP checksum (but not TCP checksum). */ len = isize - (cp - icp); if (len < 0) goto bad; len += hdrlen; ip->tot_len = htons(len); ip->check = 0; memmove(icp + hdrlen, cp, len - hdrlen); cp = icp; memcpy(cp, ip, 20); cp += 20; if (ip->ihl > 5) { memcpy(cp, cs->cs_ipopt, (ip->ihl - 5) * 4); cp += (ip->ihl - 5) * 4; } put_unaligned(ip_fast_csum(icp, ip->ihl), &((struct iphdr *)icp)->check); memcpy(cp, thp, 20); cp += 20; if (thp->doff > 5) { memcpy(cp, cs->cs_tcpopt, ((thp->doff) - 5) * 4); cp += ((thp->doff) - 5) * 4; } return len; bad: comp->sls_i_error++; return slhc_toss( comp ); } int slhc_remember(struct slcompress *comp, unsigned char *icp, int isize) { const struct tcphdr *th; unsigned char index; struct iphdr *iph; struct cstate *cs; unsigned int ihl; /* The packet is shorter than a legal IP header. * Also make sure isize is positive. */ if (isize < (int)sizeof(struct iphdr)) { runt: comp->sls_i_runt++; return slhc_toss(comp); } iph = (struct iphdr *)icp; /* Peek at the IP header's IHL field to find its length */ ihl = iph->ihl; /* The IP header length field is too small, * or packet is shorter than the IP header followed * by minimal tcp header. */ if (ihl < 5 || isize < ihl * 4 + sizeof(struct tcphdr)) goto runt; index = iph->protocol; iph->protocol = IPPROTO_TCP; if (ip_fast_csum(icp, ihl)) { /* Bad IP header checksum; discard */ comp->sls_i_badcheck++; return slhc_toss(comp); } if (index > comp->rslot_limit) { comp->sls_i_error++; return slhc_toss(comp); } th = (struct tcphdr *)(icp + ihl * 4); if (th->doff < sizeof(struct tcphdr) / 4) goto runt; if (isize < ihl * 4 + th->doff * 4) goto runt; /* Update local state */ cs = &comp->rstate[comp->recv_current = index]; comp->flags &=~ SLF_TOSS; memcpy(&cs->cs_ip, iph, sizeof(*iph)); memcpy(&cs->cs_tcp, th, sizeof(*th)); if (ihl > 5) memcpy(cs->cs_ipopt, &iph[1], (ihl - 5) * 4); if (th->doff > 5) memcpy(cs->cs_tcpopt, &th[1], (th->doff - 5) * 4); cs->cs_hsize = ihl*2 + th->doff*2; cs->initialized = true; /* Put headers back on packet * Neither header checksum is recalculated */ comp->sls_i_uncompressed++; return isize; } int slhc_toss(struct slcompress *comp) { if ( comp == NULLSLCOMPR ) return 0; comp->flags |= SLF_TOSS; return 0; } #else /* CONFIG_INET */ int slhc_toss(struct slcompress *comp) { printk(KERN_DEBUG "Called IP function on non IP-system: slhc_toss"); return -EINVAL; } int slhc_uncompress(struct slcompress *comp, unsigned char *icp, int isize) { printk(KERN_DEBUG "Called IP function on non IP-system: slhc_uncompress"); return -EINVAL; } int slhc_compress(struct slcompress *comp, unsigned char *icp, int isize, unsigned char *ocp, unsigned char **cpp, int compress_cid) { printk(KERN_DEBUG "Called IP function on non IP-system: slhc_compress"); return -EINVAL; } int slhc_remember(struct slcompress *comp, unsigned char *icp, int isize) { printk(KERN_DEBUG "Called IP function on non IP-system: slhc_remember"); return -EINVAL; } void slhc_free(struct slcompress *comp) { printk(KERN_DEBUG "Called IP function on non IP-system: slhc_free"); } struct slcompress * slhc_init(int rslots, int tslots) { printk(KERN_DEBUG "Called IP function on non IP-system: slhc_init"); return NULL; } #endif /* CONFIG_INET */ /* VJ header compression */ EXPORT_SYMBOL(slhc_init); EXPORT_SYMBOL(slhc_free); EXPORT_SYMBOL(slhc_remember); EXPORT_SYMBOL(slhc_compress); EXPORT_SYMBOL(slhc_uncompress); EXPORT_SYMBOL(slhc_toss); MODULE_LICENSE("Dual BSD/GPL"); |
14 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 | // SPDX-License-Identifier: GPL-2.0-only /* x_tables module for setting the IPv4/IPv6 DSCP field, Version 1.8 * * (C) 2002 by Harald Welte <laforge@netfilter.org> * based on ipt_FTOS.c (C) 2000 by Matthew G. Marsh <mgm@paktronix.com> * * See RFC2474 for a description of the DSCP field within the IP Header. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <net/dsfield.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_DSCP.h> MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); MODULE_DESCRIPTION("Xtables: DSCP/TOS field modification"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ipt_DSCP"); MODULE_ALIAS("ip6t_DSCP"); MODULE_ALIAS("ipt_TOS"); MODULE_ALIAS("ip6t_TOS"); #define XT_DSCP_ECN_MASK 3u static unsigned int dscp_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_DSCP_info *dinfo = par->targinfo; u_int8_t dscp = ipv4_get_dsfield(ip_hdr(skb)) >> XT_DSCP_SHIFT; if (dscp != dinfo->dscp) { if (skb_ensure_writable(skb, sizeof(struct iphdr))) return NF_DROP; ipv4_change_dsfield(ip_hdr(skb), XT_DSCP_ECN_MASK, dinfo->dscp << XT_DSCP_SHIFT); } return XT_CONTINUE; } static unsigned int dscp_tg6(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_DSCP_info *dinfo = par->targinfo; u_int8_t dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> XT_DSCP_SHIFT; if (dscp != dinfo->dscp) { if (skb_ensure_writable(skb, sizeof(struct ipv6hdr))) return NF_DROP; ipv6_change_dsfield(ipv6_hdr(skb), XT_DSCP_ECN_MASK, dinfo->dscp << XT_DSCP_SHIFT); } return XT_CONTINUE; } static int dscp_tg_check(const struct xt_tgchk_param *par) { const struct xt_DSCP_info *info = par->targinfo; if (info->dscp > XT_DSCP_MAX) return -EDOM; return 0; } static unsigned int tos_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_tos_target_info *info = par->targinfo; struct iphdr *iph = ip_hdr(skb); u_int8_t orig, nv; orig = ipv4_get_dsfield(iph); nv = (orig & ~info->tos_mask) ^ info->tos_value; if (orig != nv) { if (skb_ensure_writable(skb, sizeof(struct iphdr))) return NF_DROP; iph = ip_hdr(skb); ipv4_change_dsfield(iph, 0, nv); } return XT_CONTINUE; } static unsigned int tos_tg6(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_tos_target_info *info = par->targinfo; struct ipv6hdr *iph = ipv6_hdr(skb); u_int8_t orig, nv; orig = ipv6_get_dsfield(iph); nv = (orig & ~info->tos_mask) ^ info->tos_value; if (orig != nv) { if (skb_ensure_writable(skb, sizeof(struct iphdr))) return NF_DROP; iph = ipv6_hdr(skb); ipv6_change_dsfield(iph, 0, nv); } return XT_CONTINUE; } static struct xt_target dscp_tg_reg[] __read_mostly = { { .name = "DSCP", .family = NFPROTO_IPV4, .checkentry = dscp_tg_check, .target = dscp_tg, .targetsize = sizeof(struct xt_DSCP_info), .table = "mangle", .me = THIS_MODULE, }, { .name = "DSCP", .family = NFPROTO_IPV6, .checkentry = dscp_tg_check, .target = dscp_tg6, .targetsize = sizeof(struct xt_DSCP_info), .table = "mangle", .me = THIS_MODULE, }, { .name = "TOS", .revision = 1, .family = NFPROTO_IPV4, .table = "mangle", .target = tos_tg, .targetsize = sizeof(struct xt_tos_target_info), .me = THIS_MODULE, }, { .name = "TOS", .revision = 1, .family = NFPROTO_IPV6, .table = "mangle", .target = tos_tg6, .targetsize = sizeof(struct xt_tos_target_info), .me = THIS_MODULE, }, }; static int __init dscp_tg_init(void) { return xt_register_targets(dscp_tg_reg, ARRAY_SIZE(dscp_tg_reg)); } static void __exit dscp_tg_exit(void) { xt_unregister_targets(dscp_tg_reg, ARRAY_SIZE(dscp_tg_reg)); } module_init(dscp_tg_init); module_exit(dscp_tg_exit); |
1 1 1 1 1 2 1 1 1 1 4 2 1 1 2 1 1 2 1 1 3 1 1 1 1 1 2 1 1 1 1 1 2 2 1 1 2 2 4 3 1 3 2 1 1 1 1 2 1 1 409 35 375 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2011 Instituto Nokia de Tecnologia * * Authors: * Lauro Ramos Venancio <lauro.venancio@openbossa.org> * Aloisio Almeida Jr <aloisio.almeida@openbossa.org> * * Vendor commands implementation based on net/wireless/nl80211.c * which is: * * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH */ #define pr_fmt(fmt) KBUILD_MODNAME ": %s: " fmt, __func__ #include <net/genetlink.h> #include <linux/nfc.h> #include <linux/slab.h> #include "nfc.h" #include "llcp.h" static const struct genl_multicast_group nfc_genl_mcgrps[] = { { .name = NFC_GENL_MCAST_EVENT_NAME, }, }; static struct genl_family nfc_genl_family; static const struct nla_policy nfc_genl_policy[NFC_ATTR_MAX + 1] = { [NFC_ATTR_DEVICE_INDEX] = { .type = NLA_U32 }, [NFC_ATTR_DEVICE_NAME] = { .type = NLA_STRING, .len = NFC_DEVICE_NAME_MAXSIZE }, [NFC_ATTR_PROTOCOLS] = { .type = NLA_U32 }, [NFC_ATTR_TARGET_INDEX] = { .type = NLA_U32 }, [NFC_ATTR_COMM_MODE] = { .type = NLA_U8 }, [NFC_ATTR_RF_MODE] = { .type = NLA_U8 }, [NFC_ATTR_DEVICE_POWERED] = { .type = NLA_U8 }, [NFC_ATTR_IM_PROTOCOLS] = { .type = NLA_U32 }, [NFC_ATTR_TM_PROTOCOLS] = { .type = NLA_U32 }, [NFC_ATTR_LLC_PARAM_LTO] = { .type = NLA_U8 }, [NFC_ATTR_LLC_PARAM_RW] = { .type = NLA_U8 }, [NFC_ATTR_LLC_PARAM_MIUX] = { .type = NLA_U16 }, [NFC_ATTR_LLC_SDP] = { .type = NLA_NESTED }, [NFC_ATTR_FIRMWARE_NAME] = { .type = NLA_STRING, .len = NFC_FIRMWARE_NAME_MAXSIZE }, [NFC_ATTR_SE_INDEX] = { .type = NLA_U32 }, [NFC_ATTR_SE_APDU] = { .type = NLA_BINARY }, [NFC_ATTR_VENDOR_ID] = { .type = NLA_U32 }, [NFC_ATTR_VENDOR_SUBCMD] = { .type = NLA_U32 }, [NFC_ATTR_VENDOR_DATA] = { .type = NLA_BINARY }, }; static const struct nla_policy nfc_sdp_genl_policy[NFC_SDP_ATTR_MAX + 1] = { [NFC_SDP_ATTR_URI] = { .type = NLA_STRING, .len = U8_MAX - 4 }, [NFC_SDP_ATTR_SAP] = { .type = NLA_U8 }, }; static int nfc_genl_send_target(struct sk_buff *msg, struct nfc_target *target, struct netlink_callback *cb, int flags) { void *hdr; hdr = genlmsg_put(msg, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &nfc_genl_family, flags, NFC_CMD_GET_TARGET); if (!hdr) return -EMSGSIZE; genl_dump_check_consistent(cb, hdr); if (nla_put_u32(msg, NFC_ATTR_TARGET_INDEX, target->idx) || nla_put_u32(msg, NFC_ATTR_PROTOCOLS, target->supported_protocols) || nla_put_u16(msg, NFC_ATTR_TARGET_SENS_RES, target->sens_res) || nla_put_u8(msg, NFC_ATTR_TARGET_SEL_RES, target->sel_res)) goto nla_put_failure; if (target->nfcid1_len > 0 && nla_put(msg, NFC_ATTR_TARGET_NFCID1, target->nfcid1_len, target->nfcid1)) goto nla_put_failure; if (target->sensb_res_len > 0 && nla_put(msg, NFC_ATTR_TARGET_SENSB_RES, target->sensb_res_len, target->sensb_res)) goto nla_put_failure; if (target->sensf_res_len > 0 && nla_put(msg, NFC_ATTR_TARGET_SENSF_RES, target->sensf_res_len, target->sensf_res)) goto nla_put_failure; if (target->is_iso15693) { if (nla_put_u8(msg, NFC_ATTR_TARGET_ISO15693_DSFID, target->iso15693_dsfid) || nla_put(msg, NFC_ATTR_TARGET_ISO15693_UID, sizeof(target->iso15693_uid), target->iso15693_uid)) goto nla_put_failure; } genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static struct nfc_dev *__get_device_from_cb(struct netlink_callback *cb) { const struct genl_dumpit_info *info = genl_dumpit_info(cb); struct nfc_dev *dev; u32 idx; if (!info->attrs[NFC_ATTR_DEVICE_INDEX]) return ERR_PTR(-EINVAL); idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) return ERR_PTR(-ENODEV); return dev; } static int nfc_genl_dump_targets(struct sk_buff *skb, struct netlink_callback *cb) { int i = cb->args[0]; struct nfc_dev *dev = (struct nfc_dev *) cb->args[1]; int rc; if (!dev) { dev = __get_device_from_cb(cb); if (IS_ERR(dev)) return PTR_ERR(dev); cb->args[1] = (long) dev; } device_lock(&dev->dev); cb->seq = dev->targets_generation; while (i < dev->n_targets) { rc = nfc_genl_send_target(skb, &dev->targets[i], cb, NLM_F_MULTI); if (rc < 0) break; i++; } device_unlock(&dev->dev); cb->args[0] = i; return skb->len; } static int nfc_genl_dump_targets_done(struct netlink_callback *cb) { struct nfc_dev *dev = (struct nfc_dev *) cb->args[1]; if (dev) nfc_put_device(dev); return 0; } int nfc_genl_targets_found(struct nfc_dev *dev) { struct sk_buff *msg; void *hdr; dev->genl_data.poll_req_portid = 0; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_TARGETS_FOUND); if (!hdr) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx)) goto nla_put_failure; genlmsg_end(msg, hdr); return genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_ATOMIC); nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } int nfc_genl_target_lost(struct nfc_dev *dev, u32 target_idx) { struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_TARGET_LOST); if (!hdr) goto free_msg; if (nla_put_string(msg, NFC_ATTR_DEVICE_NAME, nfc_device_name(dev)) || nla_put_u32(msg, NFC_ATTR_TARGET_INDEX, target_idx)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } int nfc_genl_tm_activated(struct nfc_dev *dev, u32 protocol) { struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_TM_ACTIVATED); if (!hdr) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx)) goto nla_put_failure; if (nla_put_u32(msg, NFC_ATTR_TM_PROTOCOLS, protocol)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } int nfc_genl_tm_deactivated(struct nfc_dev *dev) { struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_TM_DEACTIVATED); if (!hdr) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } static int nfc_genl_setup_device_added(struct nfc_dev *dev, struct sk_buff *msg) { if (nla_put_string(msg, NFC_ATTR_DEVICE_NAME, nfc_device_name(dev)) || nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx) || nla_put_u32(msg, NFC_ATTR_PROTOCOLS, dev->supported_protocols) || nla_put_u8(msg, NFC_ATTR_DEVICE_POWERED, dev->dev_up) || nla_put_u8(msg, NFC_ATTR_RF_MODE, dev->rf_mode)) return -1; return 0; } int nfc_genl_device_added(struct nfc_dev *dev) { struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_DEVICE_ADDED); if (!hdr) goto free_msg; if (nfc_genl_setup_device_added(dev, msg)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } int nfc_genl_device_removed(struct nfc_dev *dev) { struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_DEVICE_REMOVED); if (!hdr) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } int nfc_genl_llc_send_sdres(struct nfc_dev *dev, struct hlist_head *sdres_list) { struct sk_buff *msg; struct nlattr *sdp_attr, *uri_attr; struct nfc_llcp_sdp_tlv *sdres; struct hlist_node *n; void *hdr; int rc = -EMSGSIZE; int i; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_LLC_SDRES); if (!hdr) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx)) goto nla_put_failure; sdp_attr = nla_nest_start_noflag(msg, NFC_ATTR_LLC_SDP); if (sdp_attr == NULL) { rc = -ENOMEM; goto nla_put_failure; } i = 1; hlist_for_each_entry_safe(sdres, n, sdres_list, node) { pr_debug("uri: %s, sap: %d\n", sdres->uri, sdres->sap); uri_attr = nla_nest_start_noflag(msg, i++); if (uri_attr == NULL) { rc = -ENOMEM; goto nla_put_failure; } if (nla_put_u8(msg, NFC_SDP_ATTR_SAP, sdres->sap)) goto nla_put_failure; if (nla_put_string(msg, NFC_SDP_ATTR_URI, sdres->uri)) goto nla_put_failure; nla_nest_end(msg, uri_attr); hlist_del(&sdres->node); nfc_llcp_free_sdp_tlv(sdres); } nla_nest_end(msg, sdp_attr); genlmsg_end(msg, hdr); return genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_ATOMIC); nla_put_failure: free_msg: nlmsg_free(msg); nfc_llcp_free_sdp_tlv_list(sdres_list); return rc; } int nfc_genl_se_added(struct nfc_dev *dev, u32 se_idx, u16 type) { struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_SE_ADDED); if (!hdr) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx) || nla_put_u32(msg, NFC_ATTR_SE_INDEX, se_idx) || nla_put_u8(msg, NFC_ATTR_SE_TYPE, type)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } int nfc_genl_se_removed(struct nfc_dev *dev, u32 se_idx) { struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_SE_REMOVED); if (!hdr) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx) || nla_put_u32(msg, NFC_ATTR_SE_INDEX, se_idx)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } int nfc_genl_se_transaction(struct nfc_dev *dev, u8 se_idx, struct nfc_evt_transaction *evt_transaction) { struct nfc_se *se; struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_SE_TRANSACTION); if (!hdr) goto free_msg; se = nfc_find_se(dev, se_idx); if (!se) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx) || nla_put_u32(msg, NFC_ATTR_SE_INDEX, se_idx) || nla_put_u8(msg, NFC_ATTR_SE_TYPE, se->type) || nla_put(msg, NFC_ATTR_SE_AID, evt_transaction->aid_len, evt_transaction->aid) || nla_put(msg, NFC_ATTR_SE_PARAMS, evt_transaction->params_len, evt_transaction->params)) goto nla_put_failure; /* evt_transaction is no more used */ devm_kfree(&dev->dev, evt_transaction); genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL); return 0; nla_put_failure: free_msg: /* evt_transaction is no more used */ devm_kfree(&dev->dev, evt_transaction); nlmsg_free(msg); return -EMSGSIZE; } int nfc_genl_se_connectivity(struct nfc_dev *dev, u8 se_idx) { const struct nfc_se *se; struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_SE_CONNECTIVITY); if (!hdr) goto free_msg; se = nfc_find_se(dev, se_idx); if (!se) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx) || nla_put_u32(msg, NFC_ATTR_SE_INDEX, se_idx) || nla_put_u8(msg, NFC_ATTR_SE_TYPE, se->type)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } static int nfc_genl_send_device(struct sk_buff *msg, struct nfc_dev *dev, u32 portid, u32 seq, struct netlink_callback *cb, int flags) { void *hdr; hdr = genlmsg_put(msg, portid, seq, &nfc_genl_family, flags, NFC_CMD_GET_DEVICE); if (!hdr) return -EMSGSIZE; if (cb) genl_dump_check_consistent(cb, hdr); if (nfc_genl_setup_device_added(dev, msg)) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static int nfc_genl_dump_devices(struct sk_buff *skb, struct netlink_callback *cb) { struct class_dev_iter *iter = (struct class_dev_iter *) cb->args[0]; struct nfc_dev *dev = (struct nfc_dev *) cb->args[1]; bool first_call = false; if (!iter) { first_call = true; iter = kmalloc(sizeof(struct class_dev_iter), GFP_KERNEL); if (!iter) return -ENOMEM; cb->args[0] = (long) iter; } mutex_lock(&nfc_devlist_mutex); cb->seq = nfc_devlist_generation; if (first_call) { nfc_device_iter_init(iter); dev = nfc_device_iter_next(iter); } while (dev) { int rc; rc = nfc_genl_send_device(skb, dev, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, cb, NLM_F_MULTI); if (rc < 0) break; dev = nfc_device_iter_next(iter); } mutex_unlock(&nfc_devlist_mutex); cb->args[1] = (long) dev; return skb->len; } static int nfc_genl_dump_devices_done(struct netlink_callback *cb) { struct class_dev_iter *iter = (struct class_dev_iter *) cb->args[0]; if (iter) { nfc_device_iter_exit(iter); kfree(iter); } return 0; } int nfc_genl_dep_link_up_event(struct nfc_dev *dev, u32 target_idx, u8 comm_mode, u8 rf_mode) { struct sk_buff *msg; void *hdr; pr_debug("DEP link is up\n"); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_CMD_DEP_LINK_UP); if (!hdr) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx)) goto nla_put_failure; if (rf_mode == NFC_RF_INITIATOR && nla_put_u32(msg, NFC_ATTR_TARGET_INDEX, target_idx)) goto nla_put_failure; if (nla_put_u8(msg, NFC_ATTR_COMM_MODE, comm_mode) || nla_put_u8(msg, NFC_ATTR_RF_MODE, rf_mode)) goto nla_put_failure; genlmsg_end(msg, hdr); dev->dep_link_up = true; genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_ATOMIC); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } int nfc_genl_dep_link_down_event(struct nfc_dev *dev) { struct sk_buff *msg; void *hdr; pr_debug("DEP link is down\n"); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_CMD_DEP_LINK_DOWN); if (!hdr) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_ATOMIC); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } static int nfc_genl_get_device(struct sk_buff *skb, struct genl_info *info) { struct sk_buff *msg; struct nfc_dev *dev; u32 idx; int rc = -ENOBUFS; if (!info->attrs[NFC_ATTR_DEVICE_INDEX]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { rc = -ENOMEM; goto out_putdev; } rc = nfc_genl_send_device(msg, dev, info->snd_portid, info->snd_seq, NULL, 0); if (rc < 0) goto out_free; nfc_put_device(dev); return genlmsg_reply(msg, info); out_free: nlmsg_free(msg); out_putdev: nfc_put_device(dev); return rc; } static int nfc_genl_dev_up(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; int rc; u32 idx; if (!info->attrs[NFC_ATTR_DEVICE_INDEX]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; rc = nfc_dev_up(dev); nfc_put_device(dev); return rc; } static int nfc_genl_dev_down(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; int rc; u32 idx; if (!info->attrs[NFC_ATTR_DEVICE_INDEX]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; rc = nfc_dev_down(dev); nfc_put_device(dev); return rc; } static int nfc_genl_start_poll(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; int rc; u32 idx; u32 im_protocols = 0, tm_protocols = 0; pr_debug("Poll start\n"); if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || ((!info->attrs[NFC_ATTR_IM_PROTOCOLS] && !info->attrs[NFC_ATTR_PROTOCOLS]) && !info->attrs[NFC_ATTR_TM_PROTOCOLS])) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); if (info->attrs[NFC_ATTR_TM_PROTOCOLS]) tm_protocols = nla_get_u32(info->attrs[NFC_ATTR_TM_PROTOCOLS]); if (info->attrs[NFC_ATTR_IM_PROTOCOLS]) im_protocols = nla_get_u32(info->attrs[NFC_ATTR_IM_PROTOCOLS]); else if (info->attrs[NFC_ATTR_PROTOCOLS]) im_protocols = nla_get_u32(info->attrs[NFC_ATTR_PROTOCOLS]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; mutex_lock(&dev->genl_data.genl_data_mutex); rc = nfc_start_poll(dev, im_protocols, tm_protocols); if (!rc) dev->genl_data.poll_req_portid = info->snd_portid; mutex_unlock(&dev->genl_data.genl_data_mutex); nfc_put_device(dev); return rc; } static int nfc_genl_stop_poll(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; int rc; u32 idx; if (!info->attrs[NFC_ATTR_DEVICE_INDEX]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; device_lock(&dev->dev); if (!dev->polling) { device_unlock(&dev->dev); nfc_put_device(dev); return -EINVAL; } device_unlock(&dev->dev); mutex_lock(&dev->genl_data.genl_data_mutex); if (dev->genl_data.poll_req_portid != info->snd_portid) { rc = -EBUSY; goto out; } rc = nfc_stop_poll(dev); dev->genl_data.poll_req_portid = 0; out: mutex_unlock(&dev->genl_data.genl_data_mutex); nfc_put_device(dev); return rc; } static int nfc_genl_activate_target(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; u32 device_idx, target_idx, protocol; int rc; if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_TARGET_INDEX] || !info->attrs[NFC_ATTR_PROTOCOLS]) return -EINVAL; device_idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(device_idx); if (!dev) return -ENODEV; target_idx = nla_get_u32(info->attrs[NFC_ATTR_TARGET_INDEX]); protocol = nla_get_u32(info->attrs[NFC_ATTR_PROTOCOLS]); nfc_deactivate_target(dev, target_idx, NFC_TARGET_MODE_SLEEP); rc = nfc_activate_target(dev, target_idx, protocol); nfc_put_device(dev); return rc; } static int nfc_genl_deactivate_target(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; u32 device_idx, target_idx; int rc; if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_TARGET_INDEX]) return -EINVAL; device_idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(device_idx); if (!dev) return -ENODEV; target_idx = nla_get_u32(info->attrs[NFC_ATTR_TARGET_INDEX]); rc = nfc_deactivate_target(dev, target_idx, NFC_TARGET_MODE_SLEEP); nfc_put_device(dev); return rc; } static int nfc_genl_dep_link_up(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; int rc, tgt_idx; u32 idx; u8 comm; pr_debug("DEP link up\n"); if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_COMM_MODE]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); if (!info->attrs[NFC_ATTR_TARGET_INDEX]) tgt_idx = NFC_TARGET_IDX_ANY; else tgt_idx = nla_get_u32(info->attrs[NFC_ATTR_TARGET_INDEX]); comm = nla_get_u8(info->attrs[NFC_ATTR_COMM_MODE]); if (comm != NFC_COMM_ACTIVE && comm != NFC_COMM_PASSIVE) return -EINVAL; dev = nfc_get_device(idx); if (!dev) return -ENODEV; rc = nfc_dep_link_up(dev, tgt_idx, comm); nfc_put_device(dev); return rc; } static int nfc_genl_dep_link_down(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; int rc; u32 idx; if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_TARGET_INDEX]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; rc = nfc_dep_link_down(dev); nfc_put_device(dev); return rc; } static int nfc_genl_send_params(struct sk_buff *msg, struct nfc_llcp_local *local, u32 portid, u32 seq) { void *hdr; hdr = genlmsg_put(msg, portid, seq, &nfc_genl_family, 0, NFC_CMD_LLC_GET_PARAMS); if (!hdr) return -EMSGSIZE; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, local->dev->idx) || nla_put_u8(msg, NFC_ATTR_LLC_PARAM_LTO, local->lto) || nla_put_u8(msg, NFC_ATTR_LLC_PARAM_RW, local->rw) || nla_put_u16(msg, NFC_ATTR_LLC_PARAM_MIUX, be16_to_cpu(local->miux))) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static int nfc_genl_llc_get_params(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; struct nfc_llcp_local *local; int rc = 0; struct sk_buff *msg = NULL; u32 idx; if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_FIRMWARE_NAME]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; device_lock(&dev->dev); local = nfc_llcp_find_local(dev); if (!local) { rc = -ENODEV; goto exit; } msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { rc = -ENOMEM; goto put_local; } rc = nfc_genl_send_params(msg, local, info->snd_portid, info->snd_seq); put_local: nfc_llcp_local_put(local); exit: device_unlock(&dev->dev); nfc_put_device(dev); if (rc < 0) { if (msg) nlmsg_free(msg); return rc; } return genlmsg_reply(msg, info); } static int nfc_genl_llc_set_params(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; struct nfc_llcp_local *local; u8 rw = 0; u16 miux = 0; u32 idx; int rc = 0; if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || (!info->attrs[NFC_ATTR_LLC_PARAM_LTO] && !info->attrs[NFC_ATTR_LLC_PARAM_RW] && !info->attrs[NFC_ATTR_LLC_PARAM_MIUX])) return -EINVAL; if (info->attrs[NFC_ATTR_LLC_PARAM_RW]) { rw = nla_get_u8(info->attrs[NFC_ATTR_LLC_PARAM_RW]); if (rw > LLCP_MAX_RW) return -EINVAL; } if (info->attrs[NFC_ATTR_LLC_PARAM_MIUX]) { miux = nla_get_u16(info->attrs[NFC_ATTR_LLC_PARAM_MIUX]); if (miux > LLCP_MAX_MIUX) return -EINVAL; } idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; device_lock(&dev->dev); local = nfc_llcp_find_local(dev); if (!local) { rc = -ENODEV; goto exit; } if (info->attrs[NFC_ATTR_LLC_PARAM_LTO]) { if (dev->dep_link_up) { rc = -EINPROGRESS; goto put_local; } local->lto = nla_get_u8(info->attrs[NFC_ATTR_LLC_PARAM_LTO]); } if (info->attrs[NFC_ATTR_LLC_PARAM_RW]) local->rw = rw; if (info->attrs[NFC_ATTR_LLC_PARAM_MIUX]) local->miux = cpu_to_be16(miux); put_local: nfc_llcp_local_put(local); exit: device_unlock(&dev->dev); nfc_put_device(dev); return rc; } static int nfc_genl_llc_sdreq(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; struct nfc_llcp_local *local; struct nlattr *attr, *sdp_attrs[NFC_SDP_ATTR_MAX+1]; u32 idx; u8 tid; char *uri; int rc = 0, rem; size_t uri_len, tlvs_len; struct hlist_head sdreq_list; struct nfc_llcp_sdp_tlv *sdreq; if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_LLC_SDP]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; device_lock(&dev->dev); if (dev->dep_link_up == false) { rc = -ENOLINK; goto exit; } local = nfc_llcp_find_local(dev); if (!local) { rc = -ENODEV; goto exit; } INIT_HLIST_HEAD(&sdreq_list); tlvs_len = 0; nla_for_each_nested(attr, info->attrs[NFC_ATTR_LLC_SDP], rem) { rc = nla_parse_nested_deprecated(sdp_attrs, NFC_SDP_ATTR_MAX, attr, nfc_sdp_genl_policy, info->extack); if (rc != 0) { rc = -EINVAL; goto put_local; } if (!sdp_attrs[NFC_SDP_ATTR_URI]) continue; uri_len = nla_len(sdp_attrs[NFC_SDP_ATTR_URI]); if (uri_len == 0) continue; uri = nla_data(sdp_attrs[NFC_SDP_ATTR_URI]); if (uri == NULL || *uri == 0) continue; tid = local->sdreq_next_tid++; sdreq = nfc_llcp_build_sdreq_tlv(tid, uri, uri_len); if (sdreq == NULL) { rc = -ENOMEM; goto put_local; } tlvs_len += sdreq->tlv_len; hlist_add_head(&sdreq->node, &sdreq_list); } if (hlist_empty(&sdreq_list)) { rc = -EINVAL; goto put_local; } rc = nfc_llcp_send_snl_sdreq(local, &sdreq_list, tlvs_len); put_local: nfc_llcp_local_put(local); exit: device_unlock(&dev->dev); nfc_put_device(dev); return rc; } static int nfc_genl_fw_download(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; int rc; u32 idx; char firmware_name[NFC_FIRMWARE_NAME_MAXSIZE + 1]; if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_FIRMWARE_NAME]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; nla_strscpy(firmware_name, info->attrs[NFC_ATTR_FIRMWARE_NAME], sizeof(firmware_name)); rc = nfc_fw_download(dev, firmware_name); nfc_put_device(dev); return rc; } int nfc_genl_fw_download_done(struct nfc_dev *dev, const char *firmware_name, u32 result) { struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_CMD_FW_DOWNLOAD); if (!hdr) goto free_msg; if (nla_put_string(msg, NFC_ATTR_FIRMWARE_NAME, firmware_name) || nla_put_u32(msg, NFC_ATTR_FIRMWARE_DOWNLOAD_STATUS, result) || nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_ATOMIC); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } static int nfc_genl_enable_se(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; int rc; u32 idx, se_idx; if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_SE_INDEX]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); se_idx = nla_get_u32(info->attrs[NFC_ATTR_SE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; rc = nfc_enable_se(dev, se_idx); nfc_put_device(dev); return rc; } static int nfc_genl_disable_se(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; int rc; u32 idx, se_idx; if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_SE_INDEX]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); se_idx = nla_get_u32(info->attrs[NFC_ATTR_SE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; rc = nfc_disable_se(dev, se_idx); nfc_put_device(dev); return rc; } static int nfc_genl_send_se(struct sk_buff *msg, struct nfc_dev *dev, u32 portid, u32 seq, struct netlink_callback *cb, int flags) { void *hdr; struct nfc_se *se, *n; list_for_each_entry_safe(se, n, &dev->secure_elements, list) { hdr = genlmsg_put(msg, portid, seq, &nfc_genl_family, flags, NFC_CMD_GET_SE); if (!hdr) goto nla_put_failure; if (cb) genl_dump_check_consistent(cb, hdr); if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx) || nla_put_u32(msg, NFC_ATTR_SE_INDEX, se->idx) || nla_put_u8(msg, NFC_ATTR_SE_TYPE, se->type)) goto nla_put_failure; genlmsg_end(msg, hdr); } return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static int nfc_genl_dump_ses(struct sk_buff *skb, struct netlink_callback *cb) { struct class_dev_iter *iter = (struct class_dev_iter *) cb->args[0]; struct nfc_dev *dev = (struct nfc_dev *) cb->args[1]; bool first_call = false; if (!iter) { first_call = true; iter = kmalloc(sizeof(struct class_dev_iter), GFP_KERNEL); if (!iter) return -ENOMEM; cb->args[0] = (long) iter; } mutex_lock(&nfc_devlist_mutex); cb->seq = nfc_devlist_generation; if (first_call) { nfc_device_iter_init(iter); dev = nfc_device_iter_next(iter); } while (dev) { int rc; rc = nfc_genl_send_se(skb, dev, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, cb, NLM_F_MULTI); if (rc < 0) break; dev = nfc_device_iter_next(iter); } mutex_unlock(&nfc_devlist_mutex); cb->args[1] = (long) dev; return skb->len; } static int nfc_genl_dump_ses_done(struct netlink_callback *cb) { struct class_dev_iter *iter = (struct class_dev_iter *) cb->args[0]; if (iter) { nfc_device_iter_exit(iter); kfree(iter); } return 0; } static int nfc_se_io(struct nfc_dev *dev, u32 se_idx, u8 *apdu, size_t apdu_length, se_io_cb_t cb, void *cb_context) { struct nfc_se *se; int rc; pr_debug("%s se index %d\n", dev_name(&dev->dev), se_idx); device_lock(&dev->dev); if (!device_is_registered(&dev->dev)) { rc = -ENODEV; goto error; } if (!dev->dev_up) { rc = -ENODEV; goto error; } if (!dev->ops->se_io) { rc = -EOPNOTSUPP; goto error; } se = nfc_find_se(dev, se_idx); if (!se) { rc = -EINVAL; goto error; } if (se->state != NFC_SE_ENABLED) { rc = -ENODEV; goto error; } rc = dev->ops->se_io(dev, se_idx, apdu, apdu_length, cb, cb_context); device_unlock(&dev->dev); return rc; error: device_unlock(&dev->dev); kfree(cb_context); return rc; } struct se_io_ctx { u32 dev_idx; u32 se_idx; }; static void se_io_cb(void *context, u8 *apdu, size_t apdu_len, int err) { struct se_io_ctx *ctx = context; struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { kfree(ctx); return; } hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_CMD_SE_IO); if (!hdr) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, ctx->dev_idx) || nla_put_u32(msg, NFC_ATTR_SE_INDEX, ctx->se_idx) || nla_put(msg, NFC_ATTR_SE_APDU, apdu_len, apdu)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL); kfree(ctx); return; nla_put_failure: free_msg: nlmsg_free(msg); kfree(ctx); return; } static int nfc_genl_se_io(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; struct se_io_ctx *ctx; u32 dev_idx, se_idx; u8 *apdu; size_t apdu_len; int rc; if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_SE_INDEX] || !info->attrs[NFC_ATTR_SE_APDU]) return -EINVAL; dev_idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); se_idx = nla_get_u32(info->attrs[NFC_ATTR_SE_INDEX]); dev = nfc_get_device(dev_idx); if (!dev) return -ENODEV; if (!dev->ops || !dev->ops->se_io) { rc = -EOPNOTSUPP; goto put_dev; } apdu_len = nla_len(info->attrs[NFC_ATTR_SE_APDU]); if (apdu_len == 0) { rc = -EINVAL; goto put_dev; } apdu = nla_data(info->attrs[NFC_ATTR_SE_APDU]); if (!apdu) { rc = -EINVAL; goto put_dev; } ctx = kzalloc(sizeof(struct se_io_ctx), GFP_KERNEL); if (!ctx) { rc = -ENOMEM; goto put_dev; } ctx->dev_idx = dev_idx; ctx->se_idx = se_idx; rc = nfc_se_io(dev, se_idx, apdu, apdu_len, se_io_cb, ctx); put_dev: nfc_put_device(dev); return rc; } static int nfc_genl_vendor_cmd(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; const struct nfc_vendor_cmd *cmd; u32 dev_idx, vid, subcmd; u8 *data; size_t data_len; int i, err; if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_VENDOR_ID] || !info->attrs[NFC_ATTR_VENDOR_SUBCMD]) return -EINVAL; dev_idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); vid = nla_get_u32(info->attrs[NFC_ATTR_VENDOR_ID]); subcmd = nla_get_u32(info->attrs[NFC_ATTR_VENDOR_SUBCMD]); dev = nfc_get_device(dev_idx); if (!dev) return -ENODEV; if (!dev->vendor_cmds || !dev->n_vendor_cmds) { err = -ENODEV; goto put_dev; } if (info->attrs[NFC_ATTR_VENDOR_DATA]) { data = nla_data(info->attrs[NFC_ATTR_VENDOR_DATA]); data_len = nla_len(info->attrs[NFC_ATTR_VENDOR_DATA]); if (data_len == 0) { err = -EINVAL; goto put_dev; } } else { data = NULL; data_len = 0; } for (i = 0; i < dev->n_vendor_cmds; i++) { cmd = &dev->vendor_cmds[i]; if (cmd->vendor_id != vid || cmd->subcmd != subcmd) continue; dev->cur_cmd_info = info; err = cmd->doit(dev, data, data_len); dev->cur_cmd_info = NULL; goto put_dev; } err = -EOPNOTSUPP; put_dev: nfc_put_device(dev); return err; } /* message building helper */ static inline void *nfc_hdr_put(struct sk_buff *skb, u32 portid, u32 seq, int flags, u8 cmd) { /* since there is no private header just add the generic one */ return genlmsg_put(skb, portid, seq, &nfc_genl_family, flags, cmd); } static struct sk_buff * __nfc_alloc_vendor_cmd_skb(struct nfc_dev *dev, int approxlen, u32 portid, u32 seq, enum nfc_attrs attr, u32 oui, u32 subcmd, gfp_t gfp) { struct sk_buff *skb; void *hdr; skb = nlmsg_new(approxlen + 100, gfp); if (!skb) return NULL; hdr = nfc_hdr_put(skb, portid, seq, 0, NFC_CMD_VENDOR); if (!hdr) { kfree_skb(skb); return NULL; } if (nla_put_u32(skb, NFC_ATTR_DEVICE_INDEX, dev->idx)) goto nla_put_failure; if (nla_put_u32(skb, NFC_ATTR_VENDOR_ID, oui)) goto nla_put_failure; if (nla_put_u32(skb, NFC_ATTR_VENDOR_SUBCMD, subcmd)) goto nla_put_failure; ((void **)skb->cb)[0] = dev; ((void **)skb->cb)[1] = hdr; return skb; nla_put_failure: kfree_skb(skb); return NULL; } struct sk_buff *__nfc_alloc_vendor_cmd_reply_skb(struct nfc_dev *dev, enum nfc_attrs attr, u32 oui, u32 subcmd, int approxlen) { if (WARN_ON(!dev->cur_cmd_info)) return NULL; return __nfc_alloc_vendor_cmd_skb(dev, approxlen, dev->cur_cmd_info->snd_portid, dev->cur_cmd_info->snd_seq, attr, oui, subcmd, GFP_KERNEL); } EXPORT_SYMBOL(__nfc_alloc_vendor_cmd_reply_skb); int nfc_vendor_cmd_reply(struct sk_buff *skb) { struct nfc_dev *dev = ((void **)skb->cb)[0]; void *hdr = ((void **)skb->cb)[1]; /* clear CB data for netlink core to own from now on */ memset(skb->cb, 0, sizeof(skb->cb)); if (WARN_ON(!dev->cur_cmd_info)) { kfree_skb(skb); return -EINVAL; } genlmsg_end(skb, hdr); return genlmsg_reply(skb, dev->cur_cmd_info); } EXPORT_SYMBOL(nfc_vendor_cmd_reply); static const struct genl_ops nfc_genl_ops[] = { { .cmd = NFC_CMD_GET_DEVICE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_get_device, .dumpit = nfc_genl_dump_devices, .done = nfc_genl_dump_devices_done, }, { .cmd = NFC_CMD_DEV_UP, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_dev_up, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_DEV_DOWN, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_dev_down, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_START_POLL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_start_poll, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_STOP_POLL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_stop_poll, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_DEP_LINK_UP, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_dep_link_up, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_DEP_LINK_DOWN, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_dep_link_down, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_GET_TARGET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP_STRICT, .dumpit = nfc_genl_dump_targets, .done = nfc_genl_dump_targets_done, }, { .cmd = NFC_CMD_LLC_GET_PARAMS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_llc_get_params, }, { .cmd = NFC_CMD_LLC_SET_PARAMS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_llc_set_params, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_LLC_SDREQ, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_llc_sdreq, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_FW_DOWNLOAD, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_fw_download, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_ENABLE_SE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_enable_se, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_DISABLE_SE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_disable_se, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_GET_SE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .dumpit = nfc_genl_dump_ses, .done = nfc_genl_dump_ses_done, }, { .cmd = NFC_CMD_SE_IO, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_se_io, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_ACTIVATE_TARGET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_activate_target, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_VENDOR, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_vendor_cmd, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_DEACTIVATE_TARGET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_deactivate_target, .flags = GENL_ADMIN_PERM, }, }; static struct genl_family nfc_genl_family __ro_after_init = { .hdrsize = 0, .name = NFC_GENL_NAME, .version = NFC_GENL_VERSION, .maxattr = NFC_ATTR_MAX, .policy = nfc_genl_policy, .module = THIS_MODULE, .ops = nfc_genl_ops, .n_ops = ARRAY_SIZE(nfc_genl_ops), .resv_start_op = NFC_CMD_DEACTIVATE_TARGET + 1, .mcgrps = nfc_genl_mcgrps, .n_mcgrps = ARRAY_SIZE(nfc_genl_mcgrps), }; struct urelease_work { struct work_struct w; u32 portid; }; static void nfc_urelease_event_work(struct work_struct *work) { struct urelease_work *w = container_of(work, struct urelease_work, w); struct class_dev_iter iter; struct nfc_dev *dev; pr_debug("portid %d\n", w->portid); mutex_lock(&nfc_devlist_mutex); nfc_device_iter_init(&iter); dev = nfc_device_iter_next(&iter); while (dev) { mutex_lock(&dev->genl_data.genl_data_mutex); if (dev->genl_data.poll_req_portid == w->portid) { nfc_stop_poll(dev); dev->genl_data.poll_req_portid = 0; } mutex_unlock(&dev->genl_data.genl_data_mutex); dev = nfc_device_iter_next(&iter); } nfc_device_iter_exit(&iter); mutex_unlock(&nfc_devlist_mutex); kfree(w); } static int nfc_genl_rcv_nl_event(struct notifier_block *this, unsigned long event, void *ptr) { struct netlink_notify *n = ptr; struct urelease_work *w; if (event != NETLINK_URELEASE || n->protocol != NETLINK_GENERIC) goto out; pr_debug("NETLINK_URELEASE event from id %d\n", n->portid); w = kmalloc(sizeof(*w), GFP_ATOMIC); if (w) { INIT_WORK(&w->w, nfc_urelease_event_work); w->portid = n->portid; schedule_work(&w->w); } out: return NOTIFY_DONE; } void nfc_genl_data_init(struct nfc_genl_data *genl_data) { genl_data->poll_req_portid = 0; mutex_init(&genl_data->genl_data_mutex); } void nfc_genl_data_exit(struct nfc_genl_data *genl_data) { mutex_destroy(&genl_data->genl_data_mutex); } static struct notifier_block nl_notifier = { .notifier_call = nfc_genl_rcv_nl_event, }; /** * nfc_genl_init() - Initialize netlink interface * * This initialization function registers the nfc netlink family. */ int __init nfc_genl_init(void) { int rc; rc = genl_register_family(&nfc_genl_family); if (rc) return rc; netlink_register_notifier(&nl_notifier); return 0; } /** * nfc_genl_exit() - Deinitialize netlink interface * * This exit function unregisters the nfc netlink family. */ void nfc_genl_exit(void) { netlink_unregister_notifier(&nl_notifier); genl_unregister_family(&nfc_genl_family); } |
2296 2287 15 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 | // SPDX-License-Identifier: GPL-2.0-only /* * Interface handling * * Copyright 2002-2005, Instant802 Networks, Inc. * Copyright 2005-2006, Devicescape Software, Inc. * Copyright (c) 2006 Jiri Benc <jbenc@suse.cz> * Copyright 2008, Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (c) 2016 Intel Deutschland GmbH * Copyright (C) 2018-2022 Intel Corporation */ #include <linux/slab.h> #include <linux/kernel.h> #include <linux/if_arp.h> #include <linux/netdevice.h> #include <linux/rtnetlink.h> #include <linux/kcov.h> #include <net/mac80211.h> #include <net/ieee80211_radiotap.h> #include "ieee80211_i.h" #include "sta_info.h" #include "debugfs_netdev.h" #include "mesh.h" #include "led.h" #include "driver-ops.h" #include "wme.h" #include "rate.h" /** * DOC: Interface list locking * * The interface list in each struct ieee80211_local is protected * three-fold: * * (1) modifications may only be done under the RTNL * (2) modifications and readers are protected against each other by * the iflist_mtx. * (3) modifications are done in an RCU manner so atomic readers * can traverse the list in RCU-safe blocks. * * As a consequence, reads (traversals) of the list can be protected * by either the RTNL, the iflist_mtx or RCU. */ static void ieee80211_iface_work(struct work_struct *work); bool __ieee80211_recalc_txpower(struct ieee80211_sub_if_data *sdata) { struct ieee80211_chanctx_conf *chanctx_conf; int power; rcu_read_lock(); chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf); if (!chanctx_conf) { rcu_read_unlock(); return false; } power = ieee80211_chandef_max_power(&chanctx_conf->def); rcu_read_unlock(); if (sdata->deflink.user_power_level != IEEE80211_UNSET_POWER_LEVEL) power = min(power, sdata->deflink.user_power_level); if (sdata->deflink.ap_power_level != IEEE80211_UNSET_POWER_LEVEL) power = min(power, sdata->deflink.ap_power_level); if (power != sdata->vif.bss_conf.txpower) { sdata->vif.bss_conf.txpower = power; ieee80211_hw_config(sdata->local, 0); return true; } return false; } void ieee80211_recalc_txpower(struct ieee80211_sub_if_data *sdata, bool update_bss) { if (__ieee80211_recalc_txpower(sdata) || (update_bss && ieee80211_sdata_running(sdata))) ieee80211_link_info_change_notify(sdata, &sdata->deflink, BSS_CHANGED_TXPOWER); } static u32 __ieee80211_idle_off(struct ieee80211_local *local) { if (!(local->hw.conf.flags & IEEE80211_CONF_IDLE)) return 0; local->hw.conf.flags &= ~IEEE80211_CONF_IDLE; return IEEE80211_CONF_CHANGE_IDLE; } static u32 __ieee80211_idle_on(struct ieee80211_local *local) { if (local->hw.conf.flags & IEEE80211_CONF_IDLE) return 0; ieee80211_flush_queues(local, NULL, false); local->hw.conf.flags |= IEEE80211_CONF_IDLE; return IEEE80211_CONF_CHANGE_IDLE; } static u32 __ieee80211_recalc_idle(struct ieee80211_local *local, bool force_active) { bool working, scanning, active; unsigned int led_trig_start = 0, led_trig_stop = 0; lockdep_assert_held(&local->mtx); active = force_active || !list_empty(&local->chanctx_list) || local->monitors; working = !local->ops->remain_on_channel && !list_empty(&local->roc_list); scanning = test_bit(SCAN_SW_SCANNING, &local->scanning) || test_bit(SCAN_ONCHANNEL_SCANNING, &local->scanning); if (working || scanning) led_trig_start |= IEEE80211_TPT_LEDTRIG_FL_WORK; else led_trig_stop |= IEEE80211_TPT_LEDTRIG_FL_WORK; if (active) led_trig_start |= IEEE80211_TPT_LEDTRIG_FL_CONNECTED; else led_trig_stop |= IEEE80211_TPT_LEDTRIG_FL_CONNECTED; ieee80211_mod_tpt_led_trig(local, led_trig_start, led_trig_stop); if (working || scanning || active) return __ieee80211_idle_off(local); return __ieee80211_idle_on(local); } u32 ieee80211_idle_off(struct ieee80211_local *local) { return __ieee80211_recalc_idle(local, true); } void ieee80211_recalc_idle(struct ieee80211_local *local) { u32 change = __ieee80211_recalc_idle(local, false); if (change) ieee80211_hw_config(local, change); } static int ieee80211_verify_mac(struct ieee80211_sub_if_data *sdata, u8 *addr, bool check_dup) { struct ieee80211_local *local = sdata->local; struct ieee80211_sub_if_data *iter; u64 new, mask, tmp; u8 *m; int ret = 0; if (is_zero_ether_addr(local->hw.wiphy->addr_mask)) return 0; m = addr; new = ((u64)m[0] << 5*8) | ((u64)m[1] << 4*8) | ((u64)m[2] << 3*8) | ((u64)m[3] << 2*8) | ((u64)m[4] << 1*8) | ((u64)m[5] << 0*8); m = local->hw.wiphy->addr_mask; mask = ((u64)m[0] << 5*8) | ((u64)m[1] << 4*8) | ((u64)m[2] << 3*8) | ((u64)m[3] << 2*8) | ((u64)m[4] << 1*8) | ((u64)m[5] << 0*8); if (!check_dup) return ret; mutex_lock(&local->iflist_mtx); list_for_each_entry(iter, &local->interfaces, list) { if (iter == sdata) continue; if (iter->vif.type == NL80211_IFTYPE_MONITOR && !(iter->u.mntr.flags & MONITOR_FLAG_ACTIVE)) continue; m = iter->vif.addr; tmp = ((u64)m[0] << 5*8) | ((u64)m[1] << 4*8) | ((u64)m[2] << 3*8) | ((u64)m[3] << 2*8) | ((u64)m[4] << 1*8) | ((u64)m[5] << 0*8); if ((new & ~mask) != (tmp & ~mask)) { ret = -EINVAL; break; } } mutex_unlock(&local->iflist_mtx); return ret; } static int ieee80211_can_powered_addr_change(struct ieee80211_sub_if_data *sdata) { struct ieee80211_roc_work *roc; struct ieee80211_local *local = sdata->local; struct ieee80211_sub_if_data *scan_sdata; int ret = 0; /* To be the most flexible here we want to only limit changing the * address if the specific interface is doing offchannel work or * scanning. */ if (netif_carrier_ok(sdata->dev)) return -EBUSY; mutex_lock(&local->mtx); /* First check no ROC work is happening on this iface */ list_for_each_entry(roc, &local->roc_list, list) { if (roc->sdata != sdata) continue; if (roc->started) { ret = -EBUSY; goto unlock; } } /* And if this iface is scanning */ if (local->scanning) { scan_sdata = rcu_dereference_protected(local->scan_sdata, lockdep_is_held(&local->mtx)); if (sdata == scan_sdata) ret = -EBUSY; } switch (sdata->vif.type) { case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_P2P_CLIENT: /* More interface types could be added here but changing the * address while powered makes the most sense in client modes. */ break; default: ret = -EOPNOTSUPP; } unlock: mutex_unlock(&local->mtx); return ret; } static int _ieee80211_change_mac(struct ieee80211_sub_if_data *sdata, void *addr) { struct ieee80211_local *local = sdata->local; struct sockaddr *sa = addr; bool check_dup = true; bool live = false; int ret; if (ieee80211_sdata_running(sdata)) { ret = ieee80211_can_powered_addr_change(sdata); if (ret) return ret; live = true; } if (sdata->vif.type == NL80211_IFTYPE_MONITOR && !(sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE)) check_dup = false; ret = ieee80211_verify_mac(sdata, sa->sa_data, check_dup); if (ret) return ret; if (live) drv_remove_interface(local, sdata); ret = eth_mac_addr(sdata->dev, sa); if (ret == 0) { memcpy(sdata->vif.addr, sa->sa_data, ETH_ALEN); ether_addr_copy(sdata->vif.bss_conf.addr, sdata->vif.addr); } /* Regardless of eth_mac_addr() return we still want to add the * interface back. This should not fail... */ if (live) WARN_ON(drv_add_interface(local, sdata)); return ret; } static int ieee80211_change_mac(struct net_device *dev, void *addr) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; int ret; /* * This happens during unregistration if there's a bond device * active (maybe other cases?) and we must get removed from it. * But we really don't care anymore if it's not registered now. */ if (!dev->ieee80211_ptr->registered) return 0; wiphy_lock(local->hw.wiphy); ret = _ieee80211_change_mac(sdata, addr); wiphy_unlock(local->hw.wiphy); return ret; } static inline int identical_mac_addr_allowed(int type1, int type2) { return type1 == NL80211_IFTYPE_MONITOR || type2 == NL80211_IFTYPE_MONITOR || type1 == NL80211_IFTYPE_P2P_DEVICE || type2 == NL80211_IFTYPE_P2P_DEVICE || (type1 == NL80211_IFTYPE_AP && type2 == NL80211_IFTYPE_AP_VLAN) || (type1 == NL80211_IFTYPE_AP_VLAN && (type2 == NL80211_IFTYPE_AP || type2 == NL80211_IFTYPE_AP_VLAN)); } static int ieee80211_check_concurrent_iface(struct ieee80211_sub_if_data *sdata, enum nl80211_iftype iftype) { struct ieee80211_local *local = sdata->local; struct ieee80211_sub_if_data *nsdata; int ret; ASSERT_RTNL(); /* we hold the RTNL here so can safely walk the list */ list_for_each_entry(nsdata, &local->interfaces, list) { if (nsdata != sdata && ieee80211_sdata_running(nsdata)) { /* * Only OCB and monitor mode may coexist */ if ((sdata->vif.type == NL80211_IFTYPE_OCB && nsdata->vif.type != NL80211_IFTYPE_MONITOR) || (sdata->vif.type != NL80211_IFTYPE_MONITOR && nsdata->vif.type == NL80211_IFTYPE_OCB)) return -EBUSY; /* * Allow only a single IBSS interface to be up at any * time. This is restricted because beacon distribution * cannot work properly if both are in the same IBSS. * * To remove this restriction we'd have to disallow them * from setting the same SSID on different IBSS interfaces * belonging to the same hardware. Then, however, we're * faced with having to adopt two different TSF timers... */ if (iftype == NL80211_IFTYPE_ADHOC && nsdata->vif.type == NL80211_IFTYPE_ADHOC) return -EBUSY; /* * will not add another interface while any channel * switch is active. */ if (nsdata->vif.bss_conf.csa_active) return -EBUSY; /* * The remaining checks are only performed for interfaces * with the same MAC address. */ if (!ether_addr_equal(sdata->vif.addr, nsdata->vif.addr)) continue; /* * check whether it may have the same address */ if (!identical_mac_addr_allowed(iftype, nsdata->vif.type)) return -ENOTUNIQ; /* No support for VLAN with MLO yet */ if (iftype == NL80211_IFTYPE_AP_VLAN && sdata->wdev.use_4addr && nsdata->vif.type == NL80211_IFTYPE_AP && nsdata->vif.valid_links) return -EOPNOTSUPP; /* * can only add VLANs to enabled APs */ if (iftype == NL80211_IFTYPE_AP_VLAN && nsdata->vif.type == NL80211_IFTYPE_AP) sdata->bss = &nsdata->u.ap; } } mutex_lock(&local->chanctx_mtx); ret = ieee80211_check_combinations(sdata, NULL, 0, 0); mutex_unlock(&local->chanctx_mtx); return ret; } static int ieee80211_check_queues(struct ieee80211_sub_if_data *sdata, enum nl80211_iftype iftype) { int n_queues = sdata->local->hw.queues; int i; if (iftype == NL80211_IFTYPE_NAN) return 0; if (iftype != NL80211_IFTYPE_P2P_DEVICE) { for (i = 0; i < IEEE80211_NUM_ACS; i++) { if (WARN_ON_ONCE(sdata->vif.hw_queue[i] == IEEE80211_INVAL_HW_QUEUE)) return -EINVAL; if (WARN_ON_ONCE(sdata->vif.hw_queue[i] >= n_queues)) return -EINVAL; } } if ((iftype != NL80211_IFTYPE_AP && iftype != NL80211_IFTYPE_P2P_GO && iftype != NL80211_IFTYPE_MESH_POINT) || !ieee80211_hw_check(&sdata->local->hw, QUEUE_CONTROL)) { sdata->vif.cab_queue = IEEE80211_INVAL_HW_QUEUE; return 0; } if (WARN_ON_ONCE(sdata->vif.cab_queue == IEEE80211_INVAL_HW_QUEUE)) return -EINVAL; if (WARN_ON_ONCE(sdata->vif.cab_queue >= n_queues)) return -EINVAL; return 0; } static int ieee80211_open(struct net_device *dev) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); int err; /* fail early if user set an invalid address */ if (!is_valid_ether_addr(dev->dev_addr)) return -EADDRNOTAVAIL; err = ieee80211_check_concurrent_iface(sdata, sdata->vif.type); if (err) return err; wiphy_lock(sdata->local->hw.wiphy); err = ieee80211_do_open(&sdata->wdev, true); wiphy_unlock(sdata->local->hw.wiphy); return err; } static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_down) { struct ieee80211_local *local = sdata->local; unsigned long flags; struct sk_buff_head freeq; struct sk_buff *skb, *tmp; u32 hw_reconf_flags = 0; int i, flushed; struct ps_data *ps; struct cfg80211_chan_def chandef; bool cancel_scan; struct cfg80211_nan_func *func; clear_bit(SDATA_STATE_RUNNING, &sdata->state); synchronize_rcu(); /* flush _ieee80211_wake_txqs() */ cancel_scan = rcu_access_pointer(local->scan_sdata) == sdata; if (cancel_scan) ieee80211_scan_cancel(local); /* * Stop TX on this interface first. */ if (!local->ops->wake_tx_queue && sdata->dev) netif_tx_stop_all_queues(sdata->dev); ieee80211_roc_purge(local, sdata); switch (sdata->vif.type) { case NL80211_IFTYPE_STATION: ieee80211_mgd_stop(sdata); break; case NL80211_IFTYPE_ADHOC: ieee80211_ibss_stop(sdata); break; case NL80211_IFTYPE_MONITOR: if (sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES) break; list_del_rcu(&sdata->u.mntr.list); break; default: break; } /* * Remove all stations associated with this interface. * * This must be done before calling ops->remove_interface() * because otherwise we can later invoke ops->sta_notify() * whenever the STAs are removed, and that invalidates driver * assumptions about always getting a vif pointer that is valid * (because if we remove a STA after ops->remove_interface() * the driver will have removed the vif info already!) * * For AP_VLANs stations may exist since there's nothing else that * would have removed them, but in other modes there shouldn't * be any stations. */ flushed = sta_info_flush(sdata); WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_AP_VLAN && flushed > 0); /* don't count this interface for allmulti while it is down */ if (sdata->flags & IEEE80211_SDATA_ALLMULTI) atomic_dec(&local->iff_allmultis); if (sdata->vif.type == NL80211_IFTYPE_AP) { local->fif_pspoll--; local->fif_probe_req--; } else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) { local->fif_probe_req--; } if (sdata->dev) { netif_addr_lock_bh(sdata->dev); spin_lock_bh(&local->filter_lock); __hw_addr_unsync(&local->mc_list, &sdata->dev->mc, sdata->dev->addr_len); spin_unlock_bh(&local->filter_lock); netif_addr_unlock_bh(sdata->dev); } del_timer_sync(&local->dynamic_ps_timer); cancel_work_sync(&local->dynamic_ps_enable_work); cancel_work_sync(&sdata->recalc_smps); sdata_lock(sdata); WARN(sdata->vif.valid_links, "destroying interface with valid links 0x%04x\n", sdata->vif.valid_links); mutex_lock(&local->mtx); sdata->vif.bss_conf.csa_active = false; if (sdata->vif.type == NL80211_IFTYPE_STATION) sdata->deflink.u.mgd.csa_waiting_bcn = false; if (sdata->deflink.csa_block_tx) { ieee80211_wake_vif_queues(local, sdata, IEEE80211_QUEUE_STOP_REASON_CSA); sdata->deflink.csa_block_tx = false; } mutex_unlock(&local->mtx); sdata_unlock(sdata); cancel_work_sync(&sdata->deflink.csa_finalize_work); cancel_work_sync(&sdata->deflink.color_change_finalize_work); cancel_delayed_work_sync(&sdata->deflink.dfs_cac_timer_work); if (sdata->wdev.cac_started) { chandef = sdata->vif.bss_conf.chandef; WARN_ON(local->suspended); mutex_lock(&local->mtx); ieee80211_link_release_channel(&sdata->deflink); mutex_unlock(&local->mtx); cfg80211_cac_event(sdata->dev, &chandef, NL80211_RADAR_CAC_ABORTED, GFP_KERNEL); } if (sdata->vif.type == NL80211_IFTYPE_AP) { WARN_ON(!list_empty(&sdata->u.ap.vlans)); } else if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) { /* remove all packets in parent bc_buf pointing to this dev */ ps = &sdata->bss->ps; spin_lock_irqsave(&ps->bc_buf.lock, flags); skb_queue_walk_safe(&ps->bc_buf, skb, tmp) { if (skb->dev == sdata->dev) { __skb_unlink(skb, &ps->bc_buf); local->total_ps_buffered--; ieee80211_free_txskb(&local->hw, skb); } } spin_unlock_irqrestore(&ps->bc_buf.lock, flags); } if (going_down) local->open_count--; switch (sdata->vif.type) { case NL80211_IFTYPE_AP_VLAN: mutex_lock(&local->mtx); list_del(&sdata->u.vlan.list); mutex_unlock(&local->mtx); RCU_INIT_POINTER(sdata->vif.bss_conf.chanctx_conf, NULL); /* see comment in the default case below */ ieee80211_free_keys(sdata, true); /* no need to tell driver */ break; case NL80211_IFTYPE_MONITOR: if (sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES) { local->cooked_mntrs--; break; } local->monitors--; if (local->monitors == 0) { local->hw.conf.flags &= ~IEEE80211_CONF_MONITOR; hw_reconf_flags |= IEEE80211_CONF_CHANGE_MONITOR; } ieee80211_adjust_monitor_flags(sdata, -1); break; case NL80211_IFTYPE_NAN: /* clean all the functions */ spin_lock_bh(&sdata->u.nan.func_lock); idr_for_each_entry(&sdata->u.nan.function_inst_ids, func, i) { idr_remove(&sdata->u.nan.function_inst_ids, i); cfg80211_free_nan_func(func); } idr_destroy(&sdata->u.nan.function_inst_ids); spin_unlock_bh(&sdata->u.nan.func_lock); break; case NL80211_IFTYPE_P2P_DEVICE: /* relies on synchronize_rcu() below */ RCU_INIT_POINTER(local->p2p_sdata, NULL); fallthrough; default: cancel_work_sync(&sdata->work); /* * When we get here, the interface is marked down. * Free the remaining keys, if there are any * (which can happen in AP mode if userspace sets * keys before the interface is operating) * * Force the key freeing to always synchronize_net() * to wait for the RX path in case it is using this * interface enqueuing frames at this very time on * another CPU. */ ieee80211_free_keys(sdata, true); skb_queue_purge(&sdata->skb_queue); skb_queue_purge(&sdata->status_queue); } /* * Since ieee80211_free_txskb() may issue __dev_queue_xmit() * which should be called with interrupts enabled, reclamation * is done in two phases: */ __skb_queue_head_init(&freeq); /* unlink from local queues... */ spin_lock_irqsave(&local->queue_stop_reason_lock, flags); for (i = 0; i < IEEE80211_MAX_QUEUES; i++) { skb_queue_walk_safe(&local->pending[i], skb, tmp) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); if (info->control.vif == &sdata->vif) { __skb_unlink(skb, &local->pending[i]); __skb_queue_tail(&freeq, skb); } } } spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); /* ... and perform actual reclamation with interrupts enabled. */ skb_queue_walk_safe(&freeq, skb, tmp) { __skb_unlink(skb, &freeq); ieee80211_free_txskb(&local->hw, skb); } if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) ieee80211_txq_remove_vlan(local, sdata); sdata->bss = NULL; if (local->open_count == 0) ieee80211_clear_tx_pending(local); sdata->vif.bss_conf.beacon_int = 0; /* * If the interface goes down while suspended, presumably because * the device was unplugged and that happens before our resume, * then the driver is already unconfigured and the remainder of * this function isn't needed. * XXX: what about WoWLAN? If the device has software state, e.g. * memory allocated, it might expect teardown commands from * mac80211 here? */ if (local->suspended) { WARN_ON(local->wowlan); WARN_ON(rcu_access_pointer(local->monitor_sdata)); return; } switch (sdata->vif.type) { case NL80211_IFTYPE_AP_VLAN: break; case NL80211_IFTYPE_MONITOR: if (local->monitors == 0) ieee80211_del_virtual_monitor(local); mutex_lock(&local->mtx); ieee80211_recalc_idle(local); mutex_unlock(&local->mtx); if (!(sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE)) break; fallthrough; default: if (going_down) drv_remove_interface(local, sdata); } ieee80211_recalc_ps(local); if (cancel_scan) wiphy_delayed_work_flush(local->hw.wiphy, &local->scan_work); if (local->open_count == 0) { ieee80211_stop_device(local); /* no reconfiguring after stop! */ return; } /* do after stop to avoid reconfiguring when we stop anyway */ ieee80211_configure_filter(local); ieee80211_hw_config(local, hw_reconf_flags); if (local->monitors == local->open_count) ieee80211_add_virtual_monitor(local); } static void ieee80211_stop_mbssid(struct ieee80211_sub_if_data *sdata) { struct ieee80211_sub_if_data *tx_sdata, *non_tx_sdata, *tmp_sdata; struct ieee80211_vif *tx_vif = sdata->vif.mbssid_tx_vif; if (!tx_vif) return; tx_sdata = vif_to_sdata(tx_vif); sdata->vif.mbssid_tx_vif = NULL; list_for_each_entry_safe(non_tx_sdata, tmp_sdata, &tx_sdata->local->interfaces, list) { if (non_tx_sdata != sdata && non_tx_sdata != tx_sdata && non_tx_sdata->vif.mbssid_tx_vif == tx_vif && ieee80211_sdata_running(non_tx_sdata)) { non_tx_sdata->vif.mbssid_tx_vif = NULL; dev_close(non_tx_sdata->wdev.netdev); } } if (sdata != tx_sdata && ieee80211_sdata_running(tx_sdata)) { tx_sdata->vif.mbssid_tx_vif = NULL; dev_close(tx_sdata->wdev.netdev); } } static int ieee80211_stop(struct net_device *dev) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); /* close dependent VLAN and MBSSID interfaces before locking wiphy */ if (sdata->vif.type == NL80211_IFTYPE_AP) { struct ieee80211_sub_if_data *vlan, *tmpsdata; list_for_each_entry_safe(vlan, tmpsdata, &sdata->u.ap.vlans, u.vlan.list) dev_close(vlan->dev); ieee80211_stop_mbssid(sdata); } cancel_work_sync(&sdata->activate_links_work); wiphy_lock(sdata->local->hw.wiphy); ieee80211_do_stop(sdata, true); wiphy_unlock(sdata->local->hw.wiphy); return 0; } static void ieee80211_set_multicast_list(struct net_device *dev) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; int allmulti, sdata_allmulti; allmulti = !!(dev->flags & IFF_ALLMULTI); sdata_allmulti = !!(sdata->flags & IEEE80211_SDATA_ALLMULTI); if (allmulti != sdata_allmulti) { if (dev->flags & IFF_ALLMULTI) atomic_inc(&local->iff_allmultis); else atomic_dec(&local->iff_allmultis); sdata->flags ^= IEEE80211_SDATA_ALLMULTI; } spin_lock_bh(&local->filter_lock); __hw_addr_sync(&local->mc_list, &dev->mc, dev->addr_len); spin_unlock_bh(&local->filter_lock); ieee80211_queue_work(&local->hw, &local->reconfig_filter); } /* * Called when the netdev is removed or, by the code below, before * the interface type changes. */ static void ieee80211_teardown_sdata(struct ieee80211_sub_if_data *sdata) { /* free extra data */ ieee80211_free_keys(sdata, false); ieee80211_debugfs_remove_netdev(sdata); ieee80211_destroy_frag_cache(&sdata->frags); if (ieee80211_vif_is_mesh(&sdata->vif)) ieee80211_mesh_teardown_sdata(sdata); ieee80211_vif_clear_links(sdata); ieee80211_link_stop(&sdata->deflink); } static void ieee80211_uninit(struct net_device *dev) { ieee80211_teardown_sdata(IEEE80211_DEV_TO_SUB_IF(dev)); } static u16 ieee80211_netdev_select_queue(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev) { return ieee80211_select_queue(IEEE80211_DEV_TO_SUB_IF(dev), skb); } static void ieee80211_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) { dev_fetch_sw_netstats(stats, dev->tstats); } static const struct net_device_ops ieee80211_dataif_ops = { .ndo_open = ieee80211_open, .ndo_stop = ieee80211_stop, .ndo_uninit = ieee80211_uninit, .ndo_start_xmit = ieee80211_subif_start_xmit, .ndo_set_rx_mode = ieee80211_set_multicast_list, .ndo_set_mac_address = ieee80211_change_mac, .ndo_select_queue = ieee80211_netdev_select_queue, .ndo_get_stats64 = ieee80211_get_stats64, }; static u16 ieee80211_monitor_select_queue(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_hdr *hdr; int len_rthdr; if (local->hw.queues < IEEE80211_NUM_ACS) return 0; /* reset flags and info before parsing radiotap header */ memset(info, 0, sizeof(*info)); if (!ieee80211_parse_tx_radiotap(skb, dev)) return 0; /* doesn't matter, frame will be dropped */ len_rthdr = ieee80211_get_radiotap_len(skb->data); hdr = (struct ieee80211_hdr *)(skb->data + len_rthdr); if (skb->len < len_rthdr + 2 || skb->len < len_rthdr + ieee80211_hdrlen(hdr->frame_control)) return 0; /* doesn't matter, frame will be dropped */ return ieee80211_select_queue_80211(sdata, skb, hdr); } static const struct net_device_ops ieee80211_monitorif_ops = { .ndo_open = ieee80211_open, .ndo_stop = ieee80211_stop, .ndo_uninit = ieee80211_uninit, .ndo_start_xmit = ieee80211_monitor_start_xmit, .ndo_set_rx_mode = ieee80211_set_multicast_list, .ndo_set_mac_address = ieee80211_change_mac, .ndo_select_queue = ieee80211_monitor_select_queue, .ndo_get_stats64 = ieee80211_get_stats64, }; static int ieee80211_netdev_fill_forward_path(struct net_device_path_ctx *ctx, struct net_device_path *path) { struct ieee80211_sub_if_data *sdata; struct ieee80211_local *local; struct sta_info *sta; int ret = -ENOENT; sdata = IEEE80211_DEV_TO_SUB_IF(ctx->dev); local = sdata->local; if (!local->ops->net_fill_forward_path) return -EOPNOTSUPP; rcu_read_lock(); switch (sdata->vif.type) { case NL80211_IFTYPE_AP_VLAN: sta = rcu_dereference(sdata->u.vlan.sta); if (sta) break; if (sdata->wdev.use_4addr) goto out; if (is_multicast_ether_addr(ctx->daddr)) goto out; sta = sta_info_get_bss(sdata, ctx->daddr); break; case NL80211_IFTYPE_AP: if (is_multicast_ether_addr(ctx->daddr)) goto out; sta = sta_info_get(sdata, ctx->daddr); break; case NL80211_IFTYPE_STATION: if (sdata->wdev.wiphy->flags & WIPHY_FLAG_SUPPORTS_TDLS) { sta = sta_info_get(sdata, ctx->daddr); if (sta && test_sta_flag(sta, WLAN_STA_TDLS_PEER)) { if (!test_sta_flag(sta, WLAN_STA_TDLS_PEER_AUTH)) goto out; break; } } sta = sta_info_get(sdata, sdata->deflink.u.mgd.bssid); break; default: goto out; } if (!sta) goto out; ret = drv_net_fill_forward_path(local, sdata, &sta->sta, ctx, path); out: rcu_read_unlock(); return ret; } static const struct net_device_ops ieee80211_dataif_8023_ops = { .ndo_open = ieee80211_open, .ndo_stop = ieee80211_stop, .ndo_uninit = ieee80211_uninit, .ndo_start_xmit = ieee80211_subif_start_xmit_8023, .ndo_set_rx_mode = ieee80211_set_multicast_list, .ndo_set_mac_address = ieee80211_change_mac, .ndo_select_queue = ieee80211_netdev_select_queue, .ndo_get_stats64 = ieee80211_get_stats64, .ndo_fill_forward_path = ieee80211_netdev_fill_forward_path, }; static bool ieee80211_iftype_supports_hdr_offload(enum nl80211_iftype iftype) { switch (iftype) { /* P2P GO and client are mapped to AP/STATION types */ case NL80211_IFTYPE_AP: case NL80211_IFTYPE_STATION: return true; default: return false; } } static bool ieee80211_set_sdata_offload_flags(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; u32 flags; flags = sdata->vif.offload_flags; if (ieee80211_hw_check(&local->hw, SUPPORTS_TX_ENCAP_OFFLOAD) && ieee80211_iftype_supports_hdr_offload(sdata->vif.type)) { flags |= IEEE80211_OFFLOAD_ENCAP_ENABLED; if (!ieee80211_hw_check(&local->hw, SUPPORTS_TX_FRAG) && local->hw.wiphy->frag_threshold != (u32)-1) flags &= ~IEEE80211_OFFLOAD_ENCAP_ENABLED; if (local->monitors) flags &= ~IEEE80211_OFFLOAD_ENCAP_ENABLED; } else { flags &= ~IEEE80211_OFFLOAD_ENCAP_ENABLED; } if (ieee80211_hw_check(&local->hw, SUPPORTS_RX_DECAP_OFFLOAD) && ieee80211_iftype_supports_hdr_offload(sdata->vif.type)) { flags |= IEEE80211_OFFLOAD_DECAP_ENABLED; if (local->monitors && !ieee80211_hw_check(&local->hw, SUPPORTS_CONC_MON_RX_DECAP)) flags &= ~IEEE80211_OFFLOAD_DECAP_ENABLED; } else { flags &= ~IEEE80211_OFFLOAD_DECAP_ENABLED; } if (sdata->vif.offload_flags == flags) return false; sdata->vif.offload_flags = flags; ieee80211_check_fast_rx_iface(sdata); return true; } static void ieee80211_set_vif_encap_ops(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; struct ieee80211_sub_if_data *bss = sdata; bool enabled; if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) { if (!sdata->bss) return; bss = container_of(sdata->bss, struct ieee80211_sub_if_data, u.ap); } if (!ieee80211_hw_check(&local->hw, SUPPORTS_TX_ENCAP_OFFLOAD) || !ieee80211_iftype_supports_hdr_offload(bss->vif.type)) return; enabled = bss->vif.offload_flags & IEEE80211_OFFLOAD_ENCAP_ENABLED; if (sdata->wdev.use_4addr && !(bss->vif.offload_flags & IEEE80211_OFFLOAD_ENCAP_4ADDR)) enabled = false; sdata->dev->netdev_ops = enabled ? &ieee80211_dataif_8023_ops : &ieee80211_dataif_ops; } static void ieee80211_recalc_sdata_offload(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; struct ieee80211_sub_if_data *vsdata; if (ieee80211_set_sdata_offload_flags(sdata)) { drv_update_vif_offload(local, sdata); ieee80211_set_vif_encap_ops(sdata); } list_for_each_entry(vsdata, &local->interfaces, list) { if (vsdata->vif.type != NL80211_IFTYPE_AP_VLAN || vsdata->bss != &sdata->u.ap) continue; ieee80211_set_vif_encap_ops(vsdata); } } void ieee80211_recalc_offload(struct ieee80211_local *local) { struct ieee80211_sub_if_data *sdata; if (!ieee80211_hw_check(&local->hw, SUPPORTS_TX_ENCAP_OFFLOAD)) return; mutex_lock(&local->iflist_mtx); list_for_each_entry(sdata, &local->interfaces, list) { if (!ieee80211_sdata_running(sdata)) continue; ieee80211_recalc_sdata_offload(sdata); } mutex_unlock(&local->iflist_mtx); } void ieee80211_adjust_monitor_flags(struct ieee80211_sub_if_data *sdata, const int offset) { struct ieee80211_local *local = sdata->local; u32 flags = sdata->u.mntr.flags; #define ADJUST(_f, _s) do { \ if (flags & MONITOR_FLAG_##_f) \ local->fif_##_s += offset; \ } while (0) ADJUST(FCSFAIL, fcsfail); ADJUST(PLCPFAIL, plcpfail); ADJUST(CONTROL, control); ADJUST(CONTROL, pspoll); ADJUST(OTHER_BSS, other_bss); #undef ADJUST } static void ieee80211_set_default_queues(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; int i; for (i = 0; i < IEEE80211_NUM_ACS; i++) { if (ieee80211_hw_check(&local->hw, QUEUE_CONTROL)) sdata->vif.hw_queue[i] = IEEE80211_INVAL_HW_QUEUE; else if (local->hw.queues >= IEEE80211_NUM_ACS) sdata->vif.hw_queue[i] = i; else sdata->vif.hw_queue[i] = 0; } sdata->vif.cab_queue = IEEE80211_INVAL_HW_QUEUE; } static void ieee80211_sdata_init(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { sdata->local = local; /* * Initialize the default link, so we can use link_id 0 for non-MLD, * and that continues to work for non-MLD-aware drivers that use just * vif.bss_conf instead of vif.link_conf. * * Note that we never change this, so if link ID 0 isn't used in an * MLD connection, we get a separate allocation for it. */ ieee80211_link_init(sdata, -1, &sdata->deflink, &sdata->vif.bss_conf); } int ieee80211_add_virtual_monitor(struct ieee80211_local *local) { struct ieee80211_sub_if_data *sdata; int ret; if (!ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) return 0; ASSERT_RTNL(); lockdep_assert_wiphy(local->hw.wiphy); if (local->monitor_sdata) return 0; sdata = kzalloc(sizeof(*sdata) + local->hw.vif_data_size, GFP_KERNEL); if (!sdata) return -ENOMEM; /* set up data */ sdata->vif.type = NL80211_IFTYPE_MONITOR; snprintf(sdata->name, IFNAMSIZ, "%s-monitor", wiphy_name(local->hw.wiphy)); sdata->wdev.iftype = NL80211_IFTYPE_MONITOR; ieee80211_sdata_init(local, sdata); ieee80211_set_default_queues(sdata); ret = drv_add_interface(local, sdata); if (WARN_ON(ret)) { /* ok .. stupid driver, it asked for this! */ kfree(sdata); return ret; } set_bit(SDATA_STATE_RUNNING, &sdata->state); ret = ieee80211_check_queues(sdata, NL80211_IFTYPE_MONITOR); if (ret) { kfree(sdata); return ret; } mutex_lock(&local->iflist_mtx); rcu_assign_pointer(local->monitor_sdata, sdata); mutex_unlock(&local->iflist_mtx); mutex_lock(&local->mtx); ret = ieee80211_link_use_channel(&sdata->deflink, &local->monitor_chandef, IEEE80211_CHANCTX_EXCLUSIVE); mutex_unlock(&local->mtx); if (ret) { mutex_lock(&local->iflist_mtx); RCU_INIT_POINTER(local->monitor_sdata, NULL); mutex_unlock(&local->iflist_mtx); synchronize_net(); drv_remove_interface(local, sdata); kfree(sdata); return ret; } skb_queue_head_init(&sdata->skb_queue); skb_queue_head_init(&sdata->status_queue); INIT_WORK(&sdata->work, ieee80211_iface_work); return 0; } void ieee80211_del_virtual_monitor(struct ieee80211_local *local) { struct ieee80211_sub_if_data *sdata; if (!ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) return; ASSERT_RTNL(); lockdep_assert_wiphy(local->hw.wiphy); mutex_lock(&local->iflist_mtx); sdata = rcu_dereference_protected(local->monitor_sdata, lockdep_is_held(&local->iflist_mtx)); if (!sdata) { mutex_unlock(&local->iflist_mtx); return; } RCU_INIT_POINTER(local->monitor_sdata, NULL); mutex_unlock(&local->iflist_mtx); synchronize_net(); mutex_lock(&local->mtx); ieee80211_link_release_channel(&sdata->deflink); mutex_unlock(&local->mtx); drv_remove_interface(local, sdata); kfree(sdata); } /* * NOTE: Be very careful when changing this function, it must NOT return * an error on interface type changes that have been pre-checked, so most * checks should be in ieee80211_check_concurrent_iface. */ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) { struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); struct net_device *dev = wdev->netdev; struct ieee80211_local *local = sdata->local; u32 changed = 0; int res; u32 hw_reconf_flags = 0; switch (sdata->vif.type) { case NL80211_IFTYPE_AP_VLAN: { struct ieee80211_sub_if_data *master; if (!sdata->bss) return -ENOLINK; mutex_lock(&local->mtx); list_add(&sdata->u.vlan.list, &sdata->bss->vlans); mutex_unlock(&local->mtx); master = container_of(sdata->bss, struct ieee80211_sub_if_data, u.ap); sdata->control_port_protocol = master->control_port_protocol; sdata->control_port_no_encrypt = master->control_port_no_encrypt; sdata->control_port_over_nl80211 = master->control_port_over_nl80211; sdata->control_port_no_preauth = master->control_port_no_preauth; sdata->vif.cab_queue = master->vif.cab_queue; memcpy(sdata->vif.hw_queue, master->vif.hw_queue, sizeof(sdata->vif.hw_queue)); sdata->vif.bss_conf.chandef = master->vif.bss_conf.chandef; mutex_lock(&local->key_mtx); sdata->crypto_tx_tailroom_needed_cnt += master->crypto_tx_tailroom_needed_cnt; mutex_unlock(&local->key_mtx); break; } case NL80211_IFTYPE_AP: sdata->bss = &sdata->u.ap; break; case NL80211_IFTYPE_MESH_POINT: case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_MONITOR: case NL80211_IFTYPE_ADHOC: case NL80211_IFTYPE_P2P_DEVICE: case NL80211_IFTYPE_OCB: case NL80211_IFTYPE_NAN: /* no special treatment */ break; case NL80211_IFTYPE_UNSPECIFIED: case NUM_NL80211_IFTYPES: case NL80211_IFTYPE_P2P_CLIENT: case NL80211_IFTYPE_P2P_GO: case NL80211_IFTYPE_WDS: /* cannot happen */ WARN_ON(1); break; } if (local->open_count == 0) { res = drv_start(local); if (res) goto err_del_bss; /* we're brought up, everything changes */ hw_reconf_flags = ~0; ieee80211_led_radio(local, true); ieee80211_mod_tpt_led_trig(local, IEEE80211_TPT_LEDTRIG_FL_RADIO, 0); } /* * Copy the hopefully now-present MAC address to * this interface, if it has the special null one. */ if (dev && is_zero_ether_addr(dev->dev_addr)) { eth_hw_addr_set(dev, local->hw.wiphy->perm_addr); memcpy(dev->perm_addr, dev->dev_addr, ETH_ALEN); if (!is_valid_ether_addr(dev->dev_addr)) { res = -EADDRNOTAVAIL; goto err_stop; } } switch (sdata->vif.type) { case NL80211_IFTYPE_AP_VLAN: /* no need to tell driver, but set carrier and chanctx */ if (sdata->bss->active) { ieee80211_link_vlan_copy_chanctx(&sdata->deflink); netif_carrier_on(dev); ieee80211_set_vif_encap_ops(sdata); } else { netif_carrier_off(dev); } break; case NL80211_IFTYPE_MONITOR: if (sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES) { local->cooked_mntrs++; break; } if (sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE) { res = drv_add_interface(local, sdata); if (res) goto err_stop; } else if (local->monitors == 0 && local->open_count == 0) { res = ieee80211_add_virtual_monitor(local); if (res) goto err_stop; } /* must be before the call to ieee80211_configure_filter */ local->monitors++; if (local->monitors == 1) { local->hw.conf.flags |= IEEE80211_CONF_MONITOR; hw_reconf_flags |= IEEE80211_CONF_CHANGE_MONITOR; } ieee80211_adjust_monitor_flags(sdata, 1); ieee80211_configure_filter(local); ieee80211_recalc_offload(local); mutex_lock(&local->mtx); ieee80211_recalc_idle(local); mutex_unlock(&local->mtx); netif_carrier_on(dev); break; default: if (coming_up) { ieee80211_del_virtual_monitor(local); ieee80211_set_sdata_offload_flags(sdata); res = drv_add_interface(local, sdata); if (res) goto err_stop; ieee80211_set_vif_encap_ops(sdata); res = ieee80211_check_queues(sdata, ieee80211_vif_type_p2p(&sdata->vif)); if (res) goto err_del_interface; } if (sdata->vif.type == NL80211_IFTYPE_AP) { local->fif_pspoll++; local->fif_probe_req++; ieee80211_configure_filter(local); } else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) { local->fif_probe_req++; } if (sdata->vif.probe_req_reg) drv_config_iface_filter(local, sdata, FIF_PROBE_REQ, FIF_PROBE_REQ); if (sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE && sdata->vif.type != NL80211_IFTYPE_NAN) changed |= ieee80211_reset_erp_info(sdata); ieee80211_link_info_change_notify(sdata, &sdata->deflink, changed); switch (sdata->vif.type) { case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_ADHOC: case NL80211_IFTYPE_AP: case NL80211_IFTYPE_MESH_POINT: case NL80211_IFTYPE_OCB: netif_carrier_off(dev); break; case NL80211_IFTYPE_P2P_DEVICE: case NL80211_IFTYPE_NAN: break; default: /* not reached */ WARN_ON(1); } /* * Set default queue parameters so drivers don't * need to initialise the hardware if the hardware * doesn't start up with sane defaults. * Enable QoS for anything but station interfaces. */ ieee80211_set_wmm_default(&sdata->deflink, true, sdata->vif.type != NL80211_IFTYPE_STATION); } switch (sdata->vif.type) { case NL80211_IFTYPE_P2P_DEVICE: rcu_assign_pointer(local->p2p_sdata, sdata); break; case NL80211_IFTYPE_MONITOR: if (sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES) break; list_add_tail_rcu(&sdata->u.mntr.list, &local->mon_list); break; default: break; } /* * set_multicast_list will be invoked by the networking core * which will check whether any increments here were done in * error and sync them down to the hardware as filter flags. */ if (sdata->flags & IEEE80211_SDATA_ALLMULTI) atomic_inc(&local->iff_allmultis); if (coming_up) local->open_count++; if (hw_reconf_flags) ieee80211_hw_config(local, hw_reconf_flags); ieee80211_recalc_ps(local); if (sdata->vif.type == NL80211_IFTYPE_MONITOR || sdata->vif.type == NL80211_IFTYPE_AP_VLAN || local->ops->wake_tx_queue) { /* XXX: for AP_VLAN, actually track AP queues */ if (dev) netif_tx_start_all_queues(dev); } else if (dev) { unsigned long flags; int n_acs = IEEE80211_NUM_ACS; int ac; if (local->hw.queues < IEEE80211_NUM_ACS) n_acs = 1; spin_lock_irqsave(&local->queue_stop_reason_lock, flags); if (sdata->vif.cab_queue == IEEE80211_INVAL_HW_QUEUE || (local->queue_stop_reasons[sdata->vif.cab_queue] == 0 && skb_queue_empty(&local->pending[sdata->vif.cab_queue]))) { for (ac = 0; ac < n_acs; ac++) { int ac_queue = sdata->vif.hw_queue[ac]; if (local->queue_stop_reasons[ac_queue] == 0 && skb_queue_empty(&local->pending[ac_queue])) netif_start_subqueue(dev, ac); } } spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); } set_bit(SDATA_STATE_RUNNING, &sdata->state); return 0; err_del_interface: drv_remove_interface(local, sdata); err_stop: if (!local->open_count) drv_stop(local); err_del_bss: sdata->bss = NULL; if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) { mutex_lock(&local->mtx); list_del(&sdata->u.vlan.list); mutex_unlock(&local->mtx); } /* might already be clear but that doesn't matter */ clear_bit(SDATA_STATE_RUNNING, &sdata->state); return res; } static void ieee80211_if_free(struct net_device *dev) { free_percpu(dev->tstats); } static void ieee80211_if_setup(struct net_device *dev) { ether_setup(dev); dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->netdev_ops = &ieee80211_dataif_ops; dev->needs_free_netdev = true; dev->priv_destructor = ieee80211_if_free; } static void ieee80211_if_setup_no_queue(struct net_device *dev) { ieee80211_if_setup(dev); dev->priv_flags |= IFF_NO_QUEUE; } static void ieee80211_iface_process_skb(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) { struct ieee80211_mgmt *mgmt = (void *)skb->data; if (ieee80211_is_action(mgmt->frame_control) && mgmt->u.action.category == WLAN_CATEGORY_BACK) { struct sta_info *sta; int len = skb->len; mutex_lock(&local->sta_mtx); sta = sta_info_get_bss(sdata, mgmt->sa); if (sta) { switch (mgmt->u.action.u.addba_req.action_code) { case WLAN_ACTION_ADDBA_REQ: ieee80211_process_addba_request(local, sta, mgmt, len); break; case WLAN_ACTION_ADDBA_RESP: ieee80211_process_addba_resp(local, sta, mgmt, len); break; case WLAN_ACTION_DELBA: ieee80211_process_delba(sdata, sta, mgmt, len); break; default: WARN_ON(1); break; } } mutex_unlock(&local->sta_mtx); } else if (ieee80211_is_action(mgmt->frame_control) && mgmt->u.action.category == WLAN_CATEGORY_VHT) { switch (mgmt->u.action.u.vht_group_notif.action_code) { case WLAN_VHT_ACTION_OPMODE_NOTIF: { struct ieee80211_rx_status *status; enum nl80211_band band; struct sta_info *sta; u8 opmode; status = IEEE80211_SKB_RXCB(skb); band = status->band; opmode = mgmt->u.action.u.vht_opmode_notif.operating_mode; mutex_lock(&local->sta_mtx); sta = sta_info_get_bss(sdata, mgmt->sa); if (sta) ieee80211_vht_handle_opmode(sdata, &sta->deflink, opmode, band); mutex_unlock(&local->sta_mtx); break; } case WLAN_VHT_ACTION_GROUPID_MGMT: ieee80211_process_mu_groups(sdata, &sdata->deflink, mgmt); break; default: WARN_ON(1); break; } } else if (ieee80211_is_action(mgmt->frame_control) && mgmt->u.action.category == WLAN_CATEGORY_S1G) { switch (mgmt->u.action.u.s1g.action_code) { case WLAN_S1G_TWT_TEARDOWN: case WLAN_S1G_TWT_SETUP: ieee80211_s1g_rx_twt_action(sdata, skb); break; default: break; } } else if (ieee80211_is_ext(mgmt->frame_control)) { if (sdata->vif.type == NL80211_IFTYPE_STATION) ieee80211_sta_rx_queued_ext(sdata, skb); else WARN_ON(1); } else if (ieee80211_is_data_qos(mgmt->frame_control)) { struct ieee80211_hdr *hdr = (void *)mgmt; struct sta_info *sta; /* * So the frame isn't mgmt, but frame_control * is at the right place anyway, of course, so * the if statement is correct. * * Warn if we have other data frame types here, * they must not get here. */ WARN_ON(hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_NULLFUNC)); WARN_ON(!(hdr->seq_ctrl & cpu_to_le16(IEEE80211_SCTL_FRAG))); /* * This was a fragment of a frame, received while * a block-ack session was active. That cannot be * right, so terminate the session. */ mutex_lock(&local->sta_mtx); sta = sta_info_get_bss(sdata, mgmt->sa); if (sta) { u16 tid = ieee80211_get_tid(hdr); __ieee80211_stop_rx_ba_session( sta, tid, WLAN_BACK_RECIPIENT, WLAN_REASON_QSTA_REQUIRE_SETUP, true); } mutex_unlock(&local->sta_mtx); } else switch (sdata->vif.type) { case NL80211_IFTYPE_STATION: ieee80211_sta_rx_queued_mgmt(sdata, skb); break; case NL80211_IFTYPE_ADHOC: ieee80211_ibss_rx_queued_mgmt(sdata, skb); break; case NL80211_IFTYPE_MESH_POINT: if (!ieee80211_vif_is_mesh(&sdata->vif)) break; ieee80211_mesh_rx_queued_mgmt(sdata, skb); break; default: WARN(1, "frame for unexpected interface type"); break; } } static void ieee80211_iface_process_status(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) { struct ieee80211_mgmt *mgmt = (void *)skb->data; if (ieee80211_is_action(mgmt->frame_control) && mgmt->u.action.category == WLAN_CATEGORY_S1G) { switch (mgmt->u.action.u.s1g.action_code) { case WLAN_S1G_TWT_TEARDOWN: case WLAN_S1G_TWT_SETUP: ieee80211_s1g_status_twt_action(sdata, skb); break; default: break; } } } static void ieee80211_iface_work(struct work_struct *work) { struct ieee80211_sub_if_data *sdata = container_of(work, struct ieee80211_sub_if_data, work); struct ieee80211_local *local = sdata->local; struct sk_buff *skb; if (!ieee80211_sdata_running(sdata)) return; if (test_bit(SCAN_SW_SCANNING, &local->scanning)) return; if (!ieee80211_can_run_worker(local)) return; /* first process frames */ while ((skb = skb_dequeue(&sdata->skb_queue))) { kcov_remote_start_common(skb_get_kcov_handle(skb)); if (skb->protocol == cpu_to_be16(ETH_P_TDLS)) ieee80211_process_tdls_channel_switch(sdata, skb); else ieee80211_iface_process_skb(local, sdata, skb); kfree_skb(skb); kcov_remote_stop(); } /* process status queue */ while ((skb = skb_dequeue(&sdata->status_queue))) { kcov_remote_start_common(skb_get_kcov_handle(skb)); ieee80211_iface_process_status(sdata, skb); kfree_skb(skb); kcov_remote_stop(); } /* then other type-dependent work */ switch (sdata->vif.type) { case NL80211_IFTYPE_STATION: ieee80211_sta_work(sdata); break; case NL80211_IFTYPE_ADHOC: ieee80211_ibss_work(sdata); break; case NL80211_IFTYPE_MESH_POINT: if (!ieee80211_vif_is_mesh(&sdata->vif)) break; ieee80211_mesh_work(sdata); break; case NL80211_IFTYPE_OCB: ieee80211_ocb_work(sdata); break; default: break; } } static void ieee80211_recalc_smps_work(struct work_struct *work) { struct ieee80211_sub_if_data *sdata = container_of(work, struct ieee80211_sub_if_data, recalc_smps); ieee80211_recalc_smps(sdata, &sdata->deflink); } static void ieee80211_activate_links_work(struct work_struct *work) { struct ieee80211_sub_if_data *sdata = container_of(work, struct ieee80211_sub_if_data, activate_links_work); ieee80211_set_active_links(&sdata->vif, sdata->desired_active_links); } /* * Helper function to initialise an interface to a specific type. */ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata, enum nl80211_iftype type) { static const u8 bssid_wildcard[ETH_ALEN] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; /* clear type-dependent unions */ memset(&sdata->u, 0, sizeof(sdata->u)); memset(&sdata->deflink.u, 0, sizeof(sdata->deflink.u)); /* and set some type-dependent values */ sdata->vif.type = type; sdata->vif.p2p = false; sdata->wdev.iftype = type; sdata->control_port_protocol = cpu_to_be16(ETH_P_PAE); sdata->control_port_no_encrypt = false; sdata->control_port_over_nl80211 = false; sdata->control_port_no_preauth = false; sdata->vif.cfg.idle = true; sdata->vif.bss_conf.txpower = INT_MIN; /* unset */ sdata->noack_map = 0; /* only monitor/p2p-device differ */ if (sdata->dev) { sdata->dev->netdev_ops = &ieee80211_dataif_ops; sdata->dev->type = ARPHRD_ETHER; } skb_queue_head_init(&sdata->skb_queue); skb_queue_head_init(&sdata->status_queue); INIT_WORK(&sdata->work, ieee80211_iface_work); INIT_WORK(&sdata->recalc_smps, ieee80211_recalc_smps_work); INIT_WORK(&sdata->activate_links_work, ieee80211_activate_links_work); switch (type) { case NL80211_IFTYPE_P2P_GO: type = NL80211_IFTYPE_AP; sdata->vif.type = type; sdata->vif.p2p = true; fallthrough; case NL80211_IFTYPE_AP: skb_queue_head_init(&sdata->u.ap.ps.bc_buf); INIT_LIST_HEAD(&sdata->u.ap.vlans); sdata->vif.bss_conf.bssid = sdata->vif.addr; break; case NL80211_IFTYPE_P2P_CLIENT: type = NL80211_IFTYPE_STATION; sdata->vif.type = type; sdata->vif.p2p = true; fallthrough; case NL80211_IFTYPE_STATION: sdata->vif.bss_conf.bssid = sdata->deflink.u.mgd.bssid; ieee80211_sta_setup_sdata(sdata); break; case NL80211_IFTYPE_OCB: sdata->vif.bss_conf.bssid = bssid_wildcard; ieee80211_ocb_setup_sdata(sdata); break; case NL80211_IFTYPE_ADHOC: sdata->vif.bss_conf.bssid = sdata->u.ibss.bssid; ieee80211_ibss_setup_sdata(sdata); break; case NL80211_IFTYPE_MESH_POINT: if (ieee80211_vif_is_mesh(&sdata->vif)) ieee80211_mesh_init_sdata(sdata); break; case NL80211_IFTYPE_MONITOR: sdata->dev->type = ARPHRD_IEEE80211_RADIOTAP; sdata->dev->netdev_ops = &ieee80211_monitorif_ops; sdata->u.mntr.flags = MONITOR_FLAG_CONTROL | MONITOR_FLAG_OTHER_BSS; break; case NL80211_IFTYPE_NAN: idr_init(&sdata->u.nan.function_inst_ids); spin_lock_init(&sdata->u.nan.func_lock); sdata->vif.bss_conf.bssid = sdata->vif.addr; break; case NL80211_IFTYPE_AP_VLAN: case NL80211_IFTYPE_P2P_DEVICE: sdata->vif.bss_conf.bssid = sdata->vif.addr; break; case NL80211_IFTYPE_UNSPECIFIED: case NL80211_IFTYPE_WDS: case NUM_NL80211_IFTYPES: WARN_ON(1); break; } /* need to do this after the switch so vif.type is correct */ ieee80211_link_setup(&sdata->deflink); ieee80211_debugfs_add_netdev(sdata); } static int ieee80211_runtime_change_iftype(struct ieee80211_sub_if_data *sdata, enum nl80211_iftype type) { struct ieee80211_local *local = sdata->local; int ret, err; enum nl80211_iftype internal_type = type; bool p2p = false; ASSERT_RTNL(); if (!local->ops->change_interface) return -EBUSY; /* for now, don't support changing while links exist */ if (sdata->vif.valid_links) return -EBUSY; switch (sdata->vif.type) { case NL80211_IFTYPE_AP: if (!list_empty(&sdata->u.ap.vlans)) return -EBUSY; break; case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_ADHOC: case NL80211_IFTYPE_OCB: /* * Could maybe also all others here? * Just not sure how that interacts * with the RX/config path e.g. for * mesh. */ break; default: return -EBUSY; } switch (type) { case NL80211_IFTYPE_AP: case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_ADHOC: case NL80211_IFTYPE_OCB: /* * Could probably support everything * but here. */ break; case NL80211_IFTYPE_P2P_CLIENT: p2p = true; internal_type = NL80211_IFTYPE_STATION; break; case NL80211_IFTYPE_P2P_GO: p2p = true; internal_type = NL80211_IFTYPE_AP; break; default: return -EBUSY; } ret = ieee80211_check_concurrent_iface(sdata, internal_type); if (ret) return ret; ieee80211_stop_vif_queues(local, sdata, IEEE80211_QUEUE_STOP_REASON_IFTYPE_CHANGE); synchronize_net(); ieee80211_do_stop(sdata, false); ieee80211_teardown_sdata(sdata); ieee80211_set_sdata_offload_flags(sdata); ret = drv_change_interface(local, sdata, internal_type, p2p); if (ret) type = ieee80211_vif_type_p2p(&sdata->vif); /* * Ignore return value here, there's not much we can do since * the driver changed the interface type internally already. * The warnings will hopefully make driver authors fix it :-) */ ieee80211_check_queues(sdata, type); ieee80211_setup_sdata(sdata, type); ieee80211_set_vif_encap_ops(sdata); err = ieee80211_do_open(&sdata->wdev, false); WARN(err, "type change: do_open returned %d", err); ieee80211_wake_vif_queues(local, sdata, IEEE80211_QUEUE_STOP_REASON_IFTYPE_CHANGE); return ret; } int ieee80211_if_change_type(struct ieee80211_sub_if_data *sdata, enum nl80211_iftype type) { int ret; ASSERT_RTNL(); if (type == ieee80211_vif_type_p2p(&sdata->vif)) return 0; if (ieee80211_sdata_running(sdata)) { ret = ieee80211_runtime_change_iftype(sdata, type); if (ret) return ret; } else { /* Purge and reset type-dependent state. */ ieee80211_teardown_sdata(sdata); ieee80211_setup_sdata(sdata, type); } /* reset some values that shouldn't be kept across type changes */ if (type == NL80211_IFTYPE_STATION) sdata->u.mgd.use_4addr = false; return 0; } static void ieee80211_assign_perm_addr(struct ieee80211_local *local, u8 *perm_addr, enum nl80211_iftype type) { struct ieee80211_sub_if_data *sdata; u64 mask, start, addr, val, inc; u8 *m; u8 tmp_addr[ETH_ALEN]; int i; /* default ... something at least */ memcpy(perm_addr, local->hw.wiphy->perm_addr, ETH_ALEN); if (is_zero_ether_addr(local->hw.wiphy->addr_mask) && local->hw.wiphy->n_addresses <= 1) return; mutex_lock(&local->iflist_mtx); switch (type) { case NL80211_IFTYPE_MONITOR: /* doesn't matter */ break; case NL80211_IFTYPE_AP_VLAN: /* match up with an AP interface */ list_for_each_entry(sdata, &local->interfaces, list) { if (sdata->vif.type != NL80211_IFTYPE_AP) continue; memcpy(perm_addr, sdata->vif.addr, ETH_ALEN); break; } /* keep default if no AP interface present */ break; case NL80211_IFTYPE_P2P_CLIENT: case NL80211_IFTYPE_P2P_GO: if (ieee80211_hw_check(&local->hw, P2P_DEV_ADDR_FOR_INTF)) { list_for_each_entry(sdata, &local->interfaces, list) { if (sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE) continue; if (!ieee80211_sdata_running(sdata)) continue; memcpy(perm_addr, sdata->vif.addr, ETH_ALEN); goto out_unlock; } } fallthrough; default: /* assign a new address if possible -- try n_addresses first */ for (i = 0; i < local->hw.wiphy->n_addresses; i++) { bool used = false; list_for_each_entry(sdata, &local->interfaces, list) { if (ether_addr_equal(local->hw.wiphy->addresses[i].addr, sdata->vif.addr)) { used = true; break; } } if (!used) { memcpy(perm_addr, local->hw.wiphy->addresses[i].addr, ETH_ALEN); break; } } /* try mask if available */ if (is_zero_ether_addr(local->hw.wiphy->addr_mask)) break; m = local->hw.wiphy->addr_mask; mask = ((u64)m[0] << 5*8) | ((u64)m[1] << 4*8) | ((u64)m[2] << 3*8) | ((u64)m[3] << 2*8) | ((u64)m[4] << 1*8) | ((u64)m[5] << 0*8); if (__ffs64(mask) + hweight64(mask) != fls64(mask)) { /* not a contiguous mask ... not handled now! */ pr_info("not contiguous\n"); break; } /* * Pick address of existing interface in case user changed * MAC address manually, default to perm_addr. */ m = local->hw.wiphy->perm_addr; list_for_each_entry(sdata, &local->interfaces, list) { if (sdata->vif.type == NL80211_IFTYPE_MONITOR) continue; m = sdata->vif.addr; break; } start = ((u64)m[0] << 5*8) | ((u64)m[1] << 4*8) | ((u64)m[2] << 3*8) | ((u64)m[3] << 2*8) | ((u64)m[4] << 1*8) | ((u64)m[5] << 0*8); inc = 1ULL<<__ffs64(mask); val = (start & mask); addr = (start & ~mask) | (val & mask); do { bool used = false; tmp_addr[5] = addr >> 0*8; tmp_addr[4] = addr >> 1*8; tmp_addr[3] = addr >> 2*8; tmp_addr[2] = addr >> 3*8; tmp_addr[1] = addr >> 4*8; tmp_addr[0] = addr >> 5*8; val += inc; list_for_each_entry(sdata, &local->interfaces, list) { if (ether_addr_equal(tmp_addr, sdata->vif.addr)) { used = true; break; } } if (!used) { memcpy(perm_addr, tmp_addr, ETH_ALEN); break; } addr = (start & ~mask) | (val & mask); } while (addr != start); break; } out_unlock: mutex_unlock(&local->iflist_mtx); } int ieee80211_if_add(struct ieee80211_local *local, const char *name, unsigned char name_assign_type, struct wireless_dev **new_wdev, enum nl80211_iftype type, struct vif_params *params) { struct net_device *ndev = NULL; struct ieee80211_sub_if_data *sdata = NULL; struct txq_info *txqi; void (*if_setup)(struct net_device *dev); int ret, i; int txqs = 1; ASSERT_RTNL(); if (type == NL80211_IFTYPE_P2P_DEVICE || type == NL80211_IFTYPE_NAN) { struct wireless_dev *wdev; sdata = kzalloc(sizeof(*sdata) + local->hw.vif_data_size, GFP_KERNEL); if (!sdata) return -ENOMEM; wdev = &sdata->wdev; sdata->dev = NULL; strscpy(sdata->name, name, IFNAMSIZ); ieee80211_assign_perm_addr(local, wdev->address, type); memcpy(sdata->vif.addr, wdev->address, ETH_ALEN); ether_addr_copy(sdata->vif.bss_conf.addr, sdata->vif.addr); } else { int size = ALIGN(sizeof(*sdata) + local->hw.vif_data_size, sizeof(void *)); int txq_size = 0; if (local->ops->wake_tx_queue && type != NL80211_IFTYPE_AP_VLAN && (type != NL80211_IFTYPE_MONITOR || (params->flags & MONITOR_FLAG_ACTIVE))) txq_size += sizeof(struct txq_info) + local->hw.txq_data_size; if (local->ops->wake_tx_queue) { if_setup = ieee80211_if_setup_no_queue; } else { if_setup = ieee80211_if_setup; if (local->hw.queues >= IEEE80211_NUM_ACS) txqs = IEEE80211_NUM_ACS; } ndev = alloc_netdev_mqs(size + txq_size, name, name_assign_type, if_setup, txqs, 1); if (!ndev) return -ENOMEM; if (!local->ops->wake_tx_queue && local->hw.wiphy->tx_queue_len) ndev->tx_queue_len = local->hw.wiphy->tx_queue_len; dev_net_set(ndev, wiphy_net(local->hw.wiphy)); ndev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); if (!ndev->tstats) { free_netdev(ndev); return -ENOMEM; } ndev->needed_headroom = local->tx_headroom + 4*6 /* four MAC addresses */ + 2 + 2 + 2 + 2 /* ctl, dur, seq, qos */ + 6 /* mesh */ + 8 /* rfc1042/bridge tunnel */ - ETH_HLEN /* ethernet hard_header_len */ + IEEE80211_ENCRYPT_HEADROOM; ndev->needed_tailroom = IEEE80211_ENCRYPT_TAILROOM; ret = dev_alloc_name(ndev, ndev->name); if (ret < 0) { ieee80211_if_free(ndev); free_netdev(ndev); return ret; } ieee80211_assign_perm_addr(local, ndev->perm_addr, type); if (is_valid_ether_addr(params->macaddr)) eth_hw_addr_set(ndev, params->macaddr); else eth_hw_addr_set(ndev, ndev->perm_addr); SET_NETDEV_DEV(ndev, wiphy_dev(local->hw.wiphy)); /* don't use IEEE80211_DEV_TO_SUB_IF -- it checks too much */ sdata = netdev_priv(ndev); ndev->ieee80211_ptr = &sdata->wdev; memcpy(sdata->vif.addr, ndev->dev_addr, ETH_ALEN); ether_addr_copy(sdata->vif.bss_conf.addr, sdata->vif.addr); memcpy(sdata->name, ndev->name, IFNAMSIZ); if (txq_size) { txqi = netdev_priv(ndev) + size; ieee80211_txq_init(sdata, NULL, txqi, 0); } sdata->dev = ndev; } /* initialise type-independent data */ sdata->wdev.wiphy = local->hw.wiphy; ieee80211_sdata_init(local, sdata); ieee80211_init_frag_cache(&sdata->frags); INIT_LIST_HEAD(&sdata->key_list); INIT_DELAYED_WORK(&sdata->dec_tailroom_needed_wk, ieee80211_delayed_tailroom_dec); for (i = 0; i < NUM_NL80211_BANDS; i++) { struct ieee80211_supported_band *sband; sband = local->hw.wiphy->bands[i]; sdata->rc_rateidx_mask[i] = sband ? (1 << sband->n_bitrates) - 1 : 0; if (sband) { __le16 cap; u16 *vht_rate_mask; memcpy(sdata->rc_rateidx_mcs_mask[i], sband->ht_cap.mcs.rx_mask, sizeof(sdata->rc_rateidx_mcs_mask[i])); cap = sband->vht_cap.vht_mcs.rx_mcs_map; vht_rate_mask = sdata->rc_rateidx_vht_mcs_mask[i]; ieee80211_get_vht_mask_from_cap(cap, vht_rate_mask); } else { memset(sdata->rc_rateidx_mcs_mask[i], 0, sizeof(sdata->rc_rateidx_mcs_mask[i])); memset(sdata->rc_rateidx_vht_mcs_mask[i], 0, sizeof(sdata->rc_rateidx_vht_mcs_mask[i])); } } ieee80211_set_default_queues(sdata); sdata->deflink.ap_power_level = IEEE80211_UNSET_POWER_LEVEL; sdata->deflink.user_power_level = local->user_power_level; /* setup type-dependent data */ ieee80211_setup_sdata(sdata, type); if (ndev) { ndev->ieee80211_ptr->use_4addr = params->use_4addr; if (type == NL80211_IFTYPE_STATION) sdata->u.mgd.use_4addr = params->use_4addr; ndev->features |= local->hw.netdev_features; ndev->priv_flags |= IFF_LIVE_ADDR_CHANGE; ndev->hw_features |= ndev->features & MAC80211_SUPPORTED_FEATURES_TX; netdev_set_default_ethtool_ops(ndev, &ieee80211_ethtool_ops); /* MTU range is normally 256 - 2304, where the upper limit is * the maximum MSDU size. Monitor interfaces send and receive * MPDU and A-MSDU frames which may be much larger so we do * not impose an upper limit in that case. */ ndev->min_mtu = 256; if (type == NL80211_IFTYPE_MONITOR) ndev->max_mtu = 0; else ndev->max_mtu = local->hw.max_mtu; ret = cfg80211_register_netdevice(ndev); if (ret) { free_netdev(ndev); return ret; } } mutex_lock(&local->iflist_mtx); list_add_tail_rcu(&sdata->list, &local->interfaces); mutex_unlock(&local->iflist_mtx); if (new_wdev) *new_wdev = &sdata->wdev; return 0; } void ieee80211_if_remove(struct ieee80211_sub_if_data *sdata) { ASSERT_RTNL(); mutex_lock(&sdata->local->iflist_mtx); list_del_rcu(&sdata->list); mutex_unlock(&sdata->local->iflist_mtx); if (sdata->vif.txq) ieee80211_txq_purge(sdata->local, to_txq_info(sdata->vif.txq)); synchronize_rcu(); cfg80211_unregister_wdev(&sdata->wdev); if (!sdata->dev) { ieee80211_teardown_sdata(sdata); kfree(sdata); } } void ieee80211_sdata_stop(struct ieee80211_sub_if_data *sdata) { if (WARN_ON_ONCE(!test_bit(SDATA_STATE_RUNNING, &sdata->state))) return; ieee80211_do_stop(sdata, true); } void ieee80211_remove_interfaces(struct ieee80211_local *local) { struct ieee80211_sub_if_data *sdata, *tmp; LIST_HEAD(unreg_list); LIST_HEAD(wdev_list); ASSERT_RTNL(); /* Before destroying the interfaces, make sure they're all stopped so * that the hardware is stopped. Otherwise, the driver might still be * iterating the interfaces during the shutdown, e.g. from a worker * or from RX processing or similar, and if it does so (using atomic * iteration) while we're manipulating the list, the iteration will * crash. * * After this, the hardware should be stopped and the driver should * have stopped all of its activities, so that we can do RCU-unaware * manipulations of the interface list below. */ cfg80211_shutdown_all_interfaces(local->hw.wiphy); WARN(local->open_count, "%s: open count remains %d\n", wiphy_name(local->hw.wiphy), local->open_count); ieee80211_txq_teardown_flows(local); mutex_lock(&local->iflist_mtx); list_for_each_entry_safe(sdata, tmp, &local->interfaces, list) { list_del(&sdata->list); if (sdata->dev) unregister_netdevice_queue(sdata->dev, &unreg_list); else list_add(&sdata->list, &wdev_list); } mutex_unlock(&local->iflist_mtx); unregister_netdevice_many(&unreg_list); wiphy_lock(local->hw.wiphy); list_for_each_entry_safe(sdata, tmp, &wdev_list, list) { list_del(&sdata->list); cfg80211_unregister_wdev(&sdata->wdev); kfree(sdata); } wiphy_unlock(local->hw.wiphy); } static int netdev_notify(struct notifier_block *nb, unsigned long state, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct ieee80211_sub_if_data *sdata; if (state != NETDEV_CHANGENAME) return NOTIFY_DONE; if (!dev->ieee80211_ptr || !dev->ieee80211_ptr->wiphy) return NOTIFY_DONE; if (dev->ieee80211_ptr->wiphy->privid != mac80211_wiphy_privid) return NOTIFY_DONE; sdata = IEEE80211_DEV_TO_SUB_IF(dev); memcpy(sdata->name, dev->name, IFNAMSIZ); ieee80211_debugfs_rename_netdev(sdata); return NOTIFY_OK; } static struct notifier_block mac80211_netdev_notifier = { .notifier_call = netdev_notify, }; int ieee80211_iface_init(void) { return register_netdevice_notifier(&mac80211_netdev_notifier); } void ieee80211_iface_exit(void) { unregister_netdevice_notifier(&mac80211_netdev_notifier); } void ieee80211_vif_inc_num_mcast(struct ieee80211_sub_if_data *sdata) { if (sdata->vif.type == NL80211_IFTYPE_AP) atomic_inc(&sdata->u.ap.num_mcast_sta); else if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) atomic_inc(&sdata->u.vlan.num_mcast_sta); } void ieee80211_vif_dec_num_mcast(struct ieee80211_sub_if_data *sdata) { if (sdata->vif.type == NL80211_IFTYPE_AP) atomic_dec(&sdata->u.ap.num_mcast_sta); else if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) atomic_dec(&sdata->u.vlan.num_mcast_sta); } |
14 14 14 14 91 5 86 9 54 84 55 54 5 5 55 55 84 84 84 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 | // SPDX-License-Identifier: GPL-2.0 /* * drivers/usb/core/file.c * * (C) Copyright Linus Torvalds 1999 * (C) Copyright Johannes Erdfelt 1999-2001 * (C) Copyright Andreas Gal 1999 * (C) Copyright Gregory P. Smith 1999 * (C) Copyright Deti Fliegl 1999 (new USB architecture) * (C) Copyright Randy Dunlap 2000 * (C) Copyright David Brownell 2000-2001 (kernel hotplug, usb_device_id, * more docs, etc) * (C) Copyright Yggdrasil Computing, Inc. 2000 * (usb_device_id matching changes by Adam J. Richter) * (C) Copyright Greg Kroah-Hartman 2002-2003 * * Released under the GPLv2 only. */ #include <linux/module.h> #include <linux/errno.h> #include <linux/rwsem.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/usb.h> #include "usb.h" #define MAX_USB_MINORS 256 static const struct file_operations *usb_minors[MAX_USB_MINORS]; static DECLARE_RWSEM(minor_rwsem); static DEFINE_MUTEX(init_usb_class_mutex); static int usb_open(struct inode *inode, struct file *file) { int err = -ENODEV; const struct file_operations *new_fops; down_read(&minor_rwsem); new_fops = fops_get(usb_minors[iminor(inode)]); if (!new_fops) goto done; replace_fops(file, new_fops); /* Curiouser and curiouser... NULL ->open() as "no device" ? */ if (file->f_op->open) err = file->f_op->open(inode, file); done: up_read(&minor_rwsem); return err; } static const struct file_operations usb_fops = { .owner = THIS_MODULE, .open = usb_open, .llseek = noop_llseek, }; static struct usb_class { struct kref kref; struct class *class; } *usb_class; static char *usb_devnode(struct device *dev, umode_t *mode) { struct usb_class_driver *drv; drv = dev_get_drvdata(dev); if (!drv || !drv->devnode) return NULL; return drv->devnode(dev, mode); } static int init_usb_class(void) { int result = 0; if (usb_class != NULL) { kref_get(&usb_class->kref); goto exit; } usb_class = kmalloc(sizeof(*usb_class), GFP_KERNEL); if (!usb_class) { result = -ENOMEM; goto exit; } kref_init(&usb_class->kref); usb_class->class = class_create(THIS_MODULE, "usbmisc"); if (IS_ERR(usb_class->class)) { result = PTR_ERR(usb_class->class); printk(KERN_ERR "class_create failed for usb devices\n"); kfree(usb_class); usb_class = NULL; goto exit; } usb_class->class->devnode = usb_devnode; exit: return result; } static void release_usb_class(struct kref *kref) { /* Ok, we cheat as we know we only have one usb_class */ class_destroy(usb_class->class); kfree(usb_class); usb_class = NULL; } static void destroy_usb_class(void) { mutex_lock(&init_usb_class_mutex); kref_put(&usb_class->kref, release_usb_class); mutex_unlock(&init_usb_class_mutex); } int usb_major_init(void) { int error; error = register_chrdev(USB_MAJOR, "usb", &usb_fops); if (error) printk(KERN_ERR "Unable to get major %d for usb devices\n", USB_MAJOR); return error; } void usb_major_cleanup(void) { unregister_chrdev(USB_MAJOR, "usb"); } /** * usb_register_dev - register a USB device, and ask for a minor number * @intf: pointer to the usb_interface that is being registered * @class_driver: pointer to the usb_class_driver for this device * * This should be called by all USB drivers that use the USB major number. * If CONFIG_USB_DYNAMIC_MINORS is enabled, the minor number will be * dynamically allocated out of the list of available ones. If it is not * enabled, the minor number will be based on the next available free minor, * starting at the class_driver->minor_base. * * This function also creates a usb class device in the sysfs tree. * * usb_deregister_dev() must be called when the driver is done with * the minor numbers given out by this function. * * Return: -EINVAL if something bad happens with trying to register a * device, and 0 on success. */ int usb_register_dev(struct usb_interface *intf, struct usb_class_driver *class_driver) { int retval; int minor_base = class_driver->minor_base; int minor; char name[20]; #ifdef CONFIG_USB_DYNAMIC_MINORS /* * We don't care what the device tries to start at, we want to start * at zero to pack the devices into the smallest available space with * no holes in the minor range. */ minor_base = 0; #endif if (class_driver->fops == NULL) return -EINVAL; if (intf->minor >= 0) return -EADDRINUSE; mutex_lock(&init_usb_class_mutex); retval = init_usb_class(); mutex_unlock(&init_usb_class_mutex); if (retval) return retval; dev_dbg(&intf->dev, "looking for a minor, starting at %d\n", minor_base); down_write(&minor_rwsem); for (minor = minor_base; minor < MAX_USB_MINORS; ++minor) { if (usb_minors[minor]) continue; usb_minors[minor] = class_driver->fops; intf->minor = minor; break; } if (intf->minor < 0) { up_write(&minor_rwsem); return -EXFULL; } /* create a usb class device for this usb interface */ snprintf(name, sizeof(name), class_driver->name, minor - minor_base); intf->usb_dev = device_create(usb_class->class, &intf->dev, MKDEV(USB_MAJOR, minor), class_driver, "%s", kbasename(name)); if (IS_ERR(intf->usb_dev)) { usb_minors[minor] = NULL; intf->minor = -1; retval = PTR_ERR(intf->usb_dev); } up_write(&minor_rwsem); return retval; } EXPORT_SYMBOL_GPL(usb_register_dev); /** * usb_deregister_dev - deregister a USB device's dynamic minor. * @intf: pointer to the usb_interface that is being deregistered * @class_driver: pointer to the usb_class_driver for this device * * Used in conjunction with usb_register_dev(). This function is called * when the USB driver is finished with the minor numbers gotten from a * call to usb_register_dev() (usually when the device is disconnected * from the system.) * * This function also removes the usb class device from the sysfs tree. * * This should be called by all drivers that use the USB major number. */ void usb_deregister_dev(struct usb_interface *intf, struct usb_class_driver *class_driver) { if (intf->minor == -1) return; dev_dbg(&intf->dev, "removing %d minor\n", intf->minor); device_destroy(usb_class->class, MKDEV(USB_MAJOR, intf->minor)); down_write(&minor_rwsem); usb_minors[intf->minor] = NULL; up_write(&minor_rwsem); intf->usb_dev = NULL; intf->minor = -1; destroy_usb_class(); } EXPORT_SYMBOL_GPL(usb_deregister_dev); |
89 22 22 252 80 22 22 269 261 89 80 22 269 122 8 139 269 22 260 19 19 19 14 3 12 5 5 141 141 137 28 28 28 4 25 86 55 184 178 60 57 5 58 4 60 5 5 5 1 4 5 1 1 1 1 1 2 3 1 3 3 2 2 2 2 2 2 2 2 2 2 2 3 1 1 1 1 425 425 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 | // SPDX-License-Identifier: GPL-2.0 #include <linux/rcupdate.h> #include <linux/spinlock.h> #include <linux/jiffies.h> #include <linux/module.h> #include <linux/cache.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/tcp.h> #include <linux/hash.h> #include <linux/tcp_metrics.h> #include <linux/vmalloc.h> #include <net/inet_connection_sock.h> #include <net/net_namespace.h> #include <net/request_sock.h> #include <net/inetpeer.h> #include <net/sock.h> #include <net/ipv6.h> #include <net/dst.h> #include <net/tcp.h> #include <net/genetlink.h> static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *saddr, const struct inetpeer_addr *daddr, struct net *net, unsigned int hash); struct tcp_fastopen_metrics { u16 mss; u16 syn_loss:10, /* Recurring Fast Open SYN losses */ try_exp:2; /* Request w/ exp. option (once) */ unsigned long last_syn_loss; /* Last Fast Open SYN loss */ struct tcp_fastopen_cookie cookie; }; /* TCP_METRIC_MAX includes 2 extra fields for userspace compatibility * Kernel only stores RTT and RTTVAR in usec resolution */ #define TCP_METRIC_MAX_KERNEL (TCP_METRIC_MAX - 2) struct tcp_metrics_block { struct tcp_metrics_block __rcu *tcpm_next; struct net *tcpm_net; struct inetpeer_addr tcpm_saddr; struct inetpeer_addr tcpm_daddr; unsigned long tcpm_stamp; u32 tcpm_lock; u32 tcpm_vals[TCP_METRIC_MAX_KERNEL + 1]; struct tcp_fastopen_metrics tcpm_fastopen; struct rcu_head rcu_head; }; static inline struct net *tm_net(const struct tcp_metrics_block *tm) { /* Paired with the WRITE_ONCE() in tcpm_new() */ return READ_ONCE(tm->tcpm_net); } static bool tcp_metric_locked(struct tcp_metrics_block *tm, enum tcp_metric_index idx) { /* Paired with WRITE_ONCE() in tcpm_suck_dst() */ return READ_ONCE(tm->tcpm_lock) & (1 << idx); } static u32 tcp_metric_get(const struct tcp_metrics_block *tm, enum tcp_metric_index idx) { /* Paired with WRITE_ONCE() in tcp_metric_set() */ return READ_ONCE(tm->tcpm_vals[idx]); } static void tcp_metric_set(struct tcp_metrics_block *tm, enum tcp_metric_index idx, u32 val) { /* Paired with READ_ONCE() in tcp_metric_get() */ WRITE_ONCE(tm->tcpm_vals[idx], val); } static bool addr_same(const struct inetpeer_addr *a, const struct inetpeer_addr *b) { return (a->family == b->family) && !inetpeer_addr_cmp(a, b); } struct tcpm_hash_bucket { struct tcp_metrics_block __rcu *chain; }; static struct tcpm_hash_bucket *tcp_metrics_hash __read_mostly; static unsigned int tcp_metrics_hash_log __read_mostly; static DEFINE_SPINLOCK(tcp_metrics_lock); static DEFINE_SEQLOCK(fastopen_seqlock); static void tcpm_suck_dst(struct tcp_metrics_block *tm, const struct dst_entry *dst, bool fastopen_clear) { u32 msval; u32 val; WRITE_ONCE(tm->tcpm_stamp, jiffies); val = 0; if (dst_metric_locked(dst, RTAX_RTT)) val |= 1 << TCP_METRIC_RTT; if (dst_metric_locked(dst, RTAX_RTTVAR)) val |= 1 << TCP_METRIC_RTTVAR; if (dst_metric_locked(dst, RTAX_SSTHRESH)) val |= 1 << TCP_METRIC_SSTHRESH; if (dst_metric_locked(dst, RTAX_CWND)) val |= 1 << TCP_METRIC_CWND; if (dst_metric_locked(dst, RTAX_REORDERING)) val |= 1 << TCP_METRIC_REORDERING; /* Paired with READ_ONCE() in tcp_metric_locked() */ WRITE_ONCE(tm->tcpm_lock, val); msval = dst_metric_raw(dst, RTAX_RTT); tcp_metric_set(tm, TCP_METRIC_RTT, msval * USEC_PER_MSEC); msval = dst_metric_raw(dst, RTAX_RTTVAR); tcp_metric_set(tm, TCP_METRIC_RTTVAR, msval * USEC_PER_MSEC); tcp_metric_set(tm, TCP_METRIC_SSTHRESH, dst_metric_raw(dst, RTAX_SSTHRESH)); tcp_metric_set(tm, TCP_METRIC_CWND, dst_metric_raw(dst, RTAX_CWND)); tcp_metric_set(tm, TCP_METRIC_REORDERING, dst_metric_raw(dst, RTAX_REORDERING)); if (fastopen_clear) { write_seqlock(&fastopen_seqlock); tm->tcpm_fastopen.mss = 0; tm->tcpm_fastopen.syn_loss = 0; tm->tcpm_fastopen.try_exp = 0; tm->tcpm_fastopen.cookie.exp = false; tm->tcpm_fastopen.cookie.len = 0; write_sequnlock(&fastopen_seqlock); } } #define TCP_METRICS_TIMEOUT (60 * 60 * HZ) static void tcpm_check_stamp(struct tcp_metrics_block *tm, const struct dst_entry *dst) { unsigned long limit; if (!tm) return; limit = READ_ONCE(tm->tcpm_stamp) + TCP_METRICS_TIMEOUT; if (unlikely(time_after(jiffies, limit))) tcpm_suck_dst(tm, dst, false); } #define TCP_METRICS_RECLAIM_DEPTH 5 #define TCP_METRICS_RECLAIM_PTR (struct tcp_metrics_block *) 0x1UL #define deref_locked(p) \ rcu_dereference_protected(p, lockdep_is_held(&tcp_metrics_lock)) static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst, struct inetpeer_addr *saddr, struct inetpeer_addr *daddr, unsigned int hash) { struct tcp_metrics_block *tm; struct net *net; bool reclaim = false; spin_lock_bh(&tcp_metrics_lock); net = dev_net(dst->dev); /* While waiting for the spin-lock the cache might have been populated * with this entry and so we have to check again. */ tm = __tcp_get_metrics(saddr, daddr, net, hash); if (tm == TCP_METRICS_RECLAIM_PTR) { reclaim = true; tm = NULL; } if (tm) { tcpm_check_stamp(tm, dst); goto out_unlock; } if (unlikely(reclaim)) { struct tcp_metrics_block *oldest; oldest = deref_locked(tcp_metrics_hash[hash].chain); for (tm = deref_locked(oldest->tcpm_next); tm; tm = deref_locked(tm->tcpm_next)) { if (time_before(READ_ONCE(tm->tcpm_stamp), READ_ONCE(oldest->tcpm_stamp))) oldest = tm; } tm = oldest; } else { tm = kzalloc(sizeof(*tm), GFP_ATOMIC); if (!tm) goto out_unlock; } /* Paired with the READ_ONCE() in tm_net() */ WRITE_ONCE(tm->tcpm_net, net); tm->tcpm_saddr = *saddr; tm->tcpm_daddr = *daddr; tcpm_suck_dst(tm, dst, reclaim); if (likely(!reclaim)) { tm->tcpm_next = tcp_metrics_hash[hash].chain; rcu_assign_pointer(tcp_metrics_hash[hash].chain, tm); } out_unlock: spin_unlock_bh(&tcp_metrics_lock); return tm; } static struct tcp_metrics_block *tcp_get_encode(struct tcp_metrics_block *tm, int depth) { if (tm) return tm; if (depth > TCP_METRICS_RECLAIM_DEPTH) return TCP_METRICS_RECLAIM_PTR; return NULL; } static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *saddr, const struct inetpeer_addr *daddr, struct net *net, unsigned int hash) { struct tcp_metrics_block *tm; int depth = 0; for (tm = rcu_dereference(tcp_metrics_hash[hash].chain); tm; tm = rcu_dereference(tm->tcpm_next)) { if (addr_same(&tm->tcpm_saddr, saddr) && addr_same(&tm->tcpm_daddr, daddr) && net_eq(tm_net(tm), net)) break; depth++; } return tcp_get_encode(tm, depth); } static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req, struct dst_entry *dst) { struct tcp_metrics_block *tm; struct inetpeer_addr saddr, daddr; unsigned int hash; struct net *net; saddr.family = req->rsk_ops->family; daddr.family = req->rsk_ops->family; switch (daddr.family) { case AF_INET: inetpeer_set_addr_v4(&saddr, inet_rsk(req)->ir_loc_addr); inetpeer_set_addr_v4(&daddr, inet_rsk(req)->ir_rmt_addr); hash = ipv4_addr_hash(inet_rsk(req)->ir_rmt_addr); break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: inetpeer_set_addr_v6(&saddr, &inet_rsk(req)->ir_v6_loc_addr); inetpeer_set_addr_v6(&daddr, &inet_rsk(req)->ir_v6_rmt_addr); hash = ipv6_addr_hash(&inet_rsk(req)->ir_v6_rmt_addr); break; #endif default: return NULL; } net = dev_net(dst->dev); hash ^= net_hash_mix(net); hash = hash_32(hash, tcp_metrics_hash_log); for (tm = rcu_dereference(tcp_metrics_hash[hash].chain); tm; tm = rcu_dereference(tm->tcpm_next)) { if (addr_same(&tm->tcpm_saddr, &saddr) && addr_same(&tm->tcpm_daddr, &daddr) && net_eq(tm_net(tm), net)) break; } tcpm_check_stamp(tm, dst); return tm; } static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk, struct dst_entry *dst, bool create) { struct tcp_metrics_block *tm; struct inetpeer_addr saddr, daddr; unsigned int hash; struct net *net; if (sk->sk_family == AF_INET) { inetpeer_set_addr_v4(&saddr, inet_sk(sk)->inet_saddr); inetpeer_set_addr_v4(&daddr, inet_sk(sk)->inet_daddr); hash = ipv4_addr_hash(inet_sk(sk)->inet_daddr); } #if IS_ENABLED(CONFIG_IPV6) else if (sk->sk_family == AF_INET6) { if (ipv6_addr_v4mapped(&sk->sk_v6_daddr)) { inetpeer_set_addr_v4(&saddr, inet_sk(sk)->inet_saddr); inetpeer_set_addr_v4(&daddr, inet_sk(sk)->inet_daddr); hash = ipv4_addr_hash(inet_sk(sk)->inet_daddr); } else { inetpeer_set_addr_v6(&saddr, &sk->sk_v6_rcv_saddr); inetpeer_set_addr_v6(&daddr, &sk->sk_v6_daddr); hash = ipv6_addr_hash(&sk->sk_v6_daddr); } } #endif else return NULL; net = dev_net(dst->dev); hash ^= net_hash_mix(net); hash = hash_32(hash, tcp_metrics_hash_log); tm = __tcp_get_metrics(&saddr, &daddr, net, hash); if (tm == TCP_METRICS_RECLAIM_PTR) tm = NULL; if (!tm && create) tm = tcpm_new(dst, &saddr, &daddr, hash); else tcpm_check_stamp(tm, dst); return tm; } /* Save metrics learned by this TCP session. This function is called * only, when TCP finishes successfully i.e. when it enters TIME-WAIT * or goes from LAST-ACK to CLOSE. */ void tcp_update_metrics(struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); struct dst_entry *dst = __sk_dst_get(sk); struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); struct tcp_metrics_block *tm; unsigned long rtt; u32 val; int m; sk_dst_confirm(sk); if (READ_ONCE(net->ipv4.sysctl_tcp_nometrics_save) || !dst) return; rcu_read_lock(); if (icsk->icsk_backoff || !tp->srtt_us) { /* This session failed to estimate rtt. Why? * Probably, no packets returned in time. Reset our * results. */ tm = tcp_get_metrics(sk, dst, false); if (tm && !tcp_metric_locked(tm, TCP_METRIC_RTT)) tcp_metric_set(tm, TCP_METRIC_RTT, 0); goto out_unlock; } else tm = tcp_get_metrics(sk, dst, true); if (!tm) goto out_unlock; rtt = tcp_metric_get(tm, TCP_METRIC_RTT); m = rtt - tp->srtt_us; /* If newly calculated rtt larger than stored one, store new * one. Otherwise, use EWMA. Remember, rtt overestimation is * always better than underestimation. */ if (!tcp_metric_locked(tm, TCP_METRIC_RTT)) { if (m <= 0) rtt = tp->srtt_us; else rtt -= (m >> 3); tcp_metric_set(tm, TCP_METRIC_RTT, rtt); } if (!tcp_metric_locked(tm, TCP_METRIC_RTTVAR)) { unsigned long var; if (m < 0) m = -m; /* Scale deviation to rttvar fixed point */ m >>= 1; if (m < tp->mdev_us) m = tp->mdev_us; var = tcp_metric_get(tm, TCP_METRIC_RTTVAR); if (m >= var) var = m; else var -= (var - m) >> 2; tcp_metric_set(tm, TCP_METRIC_RTTVAR, var); } if (tcp_in_initial_slowstart(tp)) { /* Slow start still did not finish. */ if (!READ_ONCE(net->ipv4.sysctl_tcp_no_ssthresh_metrics_save) && !tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) { val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH); if (val && (tcp_snd_cwnd(tp) >> 1) > val) tcp_metric_set(tm, TCP_METRIC_SSTHRESH, tcp_snd_cwnd(tp) >> 1); } if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) { val = tcp_metric_get(tm, TCP_METRIC_CWND); if (tcp_snd_cwnd(tp) > val) tcp_metric_set(tm, TCP_METRIC_CWND, tcp_snd_cwnd(tp)); } } else if (!tcp_in_slow_start(tp) && icsk->icsk_ca_state == TCP_CA_Open) { /* Cong. avoidance phase, cwnd is reliable. */ if (!READ_ONCE(net->ipv4.sysctl_tcp_no_ssthresh_metrics_save) && !tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) tcp_metric_set(tm, TCP_METRIC_SSTHRESH, max(tcp_snd_cwnd(tp) >> 1, tp->snd_ssthresh)); if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) { val = tcp_metric_get(tm, TCP_METRIC_CWND); tcp_metric_set(tm, TCP_METRIC_CWND, (val + tcp_snd_cwnd(tp)) >> 1); } } else { /* Else slow start did not finish, cwnd is non-sense, * ssthresh may be also invalid. */ if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) { val = tcp_metric_get(tm, TCP_METRIC_CWND); tcp_metric_set(tm, TCP_METRIC_CWND, (val + tp->snd_ssthresh) >> 1); } if (!READ_ONCE(net->ipv4.sysctl_tcp_no_ssthresh_metrics_save) && !tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) { val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH); if (val && tp->snd_ssthresh > val) tcp_metric_set(tm, TCP_METRIC_SSTHRESH, tp->snd_ssthresh); } if (!tcp_metric_locked(tm, TCP_METRIC_REORDERING)) { val = tcp_metric_get(tm, TCP_METRIC_REORDERING); if (val < tp->reordering && tp->reordering != READ_ONCE(net->ipv4.sysctl_tcp_reordering)) tcp_metric_set(tm, TCP_METRIC_REORDERING, tp->reordering); } } WRITE_ONCE(tm->tcpm_stamp, jiffies); out_unlock: rcu_read_unlock(); } /* Initialize metrics on socket. */ void tcp_init_metrics(struct sock *sk) { struct dst_entry *dst = __sk_dst_get(sk); struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); struct tcp_metrics_block *tm; u32 val, crtt = 0; /* cached RTT scaled by 8 */ sk_dst_confirm(sk); /* ssthresh may have been reduced unnecessarily during. * 3WHS. Restore it back to its initial default. */ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; if (!dst) goto reset; rcu_read_lock(); tm = tcp_get_metrics(sk, dst, false); if (!tm) { rcu_read_unlock(); goto reset; } if (tcp_metric_locked(tm, TCP_METRIC_CWND)) tp->snd_cwnd_clamp = tcp_metric_get(tm, TCP_METRIC_CWND); val = READ_ONCE(net->ipv4.sysctl_tcp_no_ssthresh_metrics_save) ? 0 : tcp_metric_get(tm, TCP_METRIC_SSTHRESH); if (val) { tp->snd_ssthresh = val; if (tp->snd_ssthresh > tp->snd_cwnd_clamp) tp->snd_ssthresh = tp->snd_cwnd_clamp; } val = tcp_metric_get(tm, TCP_METRIC_REORDERING); if (val && tp->reordering != val) tp->reordering = val; crtt = tcp_metric_get(tm, TCP_METRIC_RTT); rcu_read_unlock(); reset: /* The initial RTT measurement from the SYN/SYN-ACK is not ideal * to seed the RTO for later data packets because SYN packets are * small. Use the per-dst cached values to seed the RTO but keep * the RTT estimator variables intact (e.g., srtt, mdev, rttvar). * Later the RTO will be updated immediately upon obtaining the first * data RTT sample (tcp_rtt_estimator()). Hence the cached RTT only * influences the first RTO but not later RTT estimation. * * But if RTT is not available from the SYN (due to retransmits or * syn cookies) or the cache, force a conservative 3secs timeout. * * A bit of theory. RTT is time passed after "normal" sized packet * is sent until it is ACKed. In normal circumstances sending small * packets force peer to delay ACKs and calculation is correct too. * The algorithm is adaptive and, provided we follow specs, it * NEVER underestimate RTT. BUT! If peer tries to make some clever * tricks sort of "quick acks" for time long enough to decrease RTT * to low value, and then abruptly stops to do it and starts to delay * ACKs, wait for troubles. */ if (crtt > tp->srtt_us) { /* Set RTO like tcp_rtt_estimator(), but from cached RTT. */ crtt /= 8 * USEC_PER_SEC / HZ; inet_csk(sk)->icsk_rto = crtt + max(2 * crtt, tcp_rto_min(sk)); } else if (tp->srtt_us == 0) { /* RFC6298: 5.7 We've failed to get a valid RTT sample from * 3WHS. This is most likely due to retransmission, * including spurious one. Reset the RTO back to 3secs * from the more aggressive 1sec to avoid more spurious * retransmission. */ tp->rttvar_us = jiffies_to_usecs(TCP_TIMEOUT_FALLBACK); tp->mdev_us = tp->mdev_max_us = tp->rttvar_us; inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK; } } bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst) { struct tcp_metrics_block *tm; bool ret; if (!dst) return false; rcu_read_lock(); tm = __tcp_get_metrics_req(req, dst); if (tm && tcp_metric_get(tm, TCP_METRIC_RTT)) ret = true; else ret = false; rcu_read_unlock(); return ret; } void tcp_fastopen_cache_get(struct sock *sk, u16 *mss, struct tcp_fastopen_cookie *cookie) { struct tcp_metrics_block *tm; rcu_read_lock(); tm = tcp_get_metrics(sk, __sk_dst_get(sk), false); if (tm) { struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen; unsigned int seq; do { seq = read_seqbegin(&fastopen_seqlock); if (tfom->mss) *mss = tfom->mss; *cookie = tfom->cookie; if (cookie->len <= 0 && tfom->try_exp == 1) cookie->exp = true; } while (read_seqretry(&fastopen_seqlock, seq)); } rcu_read_unlock(); } void tcp_fastopen_cache_set(struct sock *sk, u16 mss, struct tcp_fastopen_cookie *cookie, bool syn_lost, u16 try_exp) { struct dst_entry *dst = __sk_dst_get(sk); struct tcp_metrics_block *tm; if (!dst) return; rcu_read_lock(); tm = tcp_get_metrics(sk, dst, true); if (tm) { struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen; write_seqlock_bh(&fastopen_seqlock); if (mss) tfom->mss = mss; if (cookie && cookie->len > 0) tfom->cookie = *cookie; else if (try_exp > tfom->try_exp && tfom->cookie.len <= 0 && !tfom->cookie.exp) tfom->try_exp = try_exp; if (syn_lost) { ++tfom->syn_loss; tfom->last_syn_loss = jiffies; } else tfom->syn_loss = 0; write_sequnlock_bh(&fastopen_seqlock); } rcu_read_unlock(); } static struct genl_family tcp_metrics_nl_family; static const struct nla_policy tcp_metrics_nl_policy[TCP_METRICS_ATTR_MAX + 1] = { [TCP_METRICS_ATTR_ADDR_IPV4] = { .type = NLA_U32, }, [TCP_METRICS_ATTR_ADDR_IPV6] = { .type = NLA_BINARY, .len = sizeof(struct in6_addr), }, [TCP_METRICS_ATTR_SADDR_IPV4] = { .type = NLA_U32, }, /* Following attributes are not received for GET/DEL, * we keep them for reference */ #if 0 [TCP_METRICS_ATTR_AGE] = { .type = NLA_MSECS, }, [TCP_METRICS_ATTR_TW_TSVAL] = { .type = NLA_U32, }, [TCP_METRICS_ATTR_TW_TS_STAMP] = { .type = NLA_S32, }, [TCP_METRICS_ATTR_VALS] = { .type = NLA_NESTED, }, [TCP_METRICS_ATTR_FOPEN_MSS] = { .type = NLA_U16, }, [TCP_METRICS_ATTR_FOPEN_SYN_DROPS] = { .type = NLA_U16, }, [TCP_METRICS_ATTR_FOPEN_SYN_DROP_TS] = { .type = NLA_MSECS, }, [TCP_METRICS_ATTR_FOPEN_COOKIE] = { .type = NLA_BINARY, .len = TCP_FASTOPEN_COOKIE_MAX, }, #endif }; /* Add attributes, caller cancels its header on failure */ static int tcp_metrics_fill_info(struct sk_buff *msg, struct tcp_metrics_block *tm) { struct nlattr *nest; int i; switch (tm->tcpm_daddr.family) { case AF_INET: if (nla_put_in_addr(msg, TCP_METRICS_ATTR_ADDR_IPV4, inetpeer_get_addr_v4(&tm->tcpm_daddr)) < 0) goto nla_put_failure; if (nla_put_in_addr(msg, TCP_METRICS_ATTR_SADDR_IPV4, inetpeer_get_addr_v4(&tm->tcpm_saddr)) < 0) goto nla_put_failure; break; case AF_INET6: if (nla_put_in6_addr(msg, TCP_METRICS_ATTR_ADDR_IPV6, inetpeer_get_addr_v6(&tm->tcpm_daddr)) < 0) goto nla_put_failure; if (nla_put_in6_addr(msg, TCP_METRICS_ATTR_SADDR_IPV6, inetpeer_get_addr_v6(&tm->tcpm_saddr)) < 0) goto nla_put_failure; break; default: return -EAFNOSUPPORT; } if (nla_put_msecs(msg, TCP_METRICS_ATTR_AGE, jiffies - READ_ONCE(tm->tcpm_stamp), TCP_METRICS_ATTR_PAD) < 0) goto nla_put_failure; { int n = 0; nest = nla_nest_start_noflag(msg, TCP_METRICS_ATTR_VALS); if (!nest) goto nla_put_failure; for (i = 0; i < TCP_METRIC_MAX_KERNEL + 1; i++) { u32 val = tcp_metric_get(tm, i); if (!val) continue; if (i == TCP_METRIC_RTT) { if (nla_put_u32(msg, TCP_METRIC_RTT_US + 1, val) < 0) goto nla_put_failure; n++; val = max(val / 1000, 1U); } if (i == TCP_METRIC_RTTVAR) { if (nla_put_u32(msg, TCP_METRIC_RTTVAR_US + 1, val) < 0) goto nla_put_failure; n++; val = max(val / 1000, 1U); } if (nla_put_u32(msg, i + 1, val) < 0) goto nla_put_failure; n++; } if (n) nla_nest_end(msg, nest); else nla_nest_cancel(msg, nest); } { struct tcp_fastopen_metrics tfom_copy[1], *tfom; unsigned int seq; do { seq = read_seqbegin(&fastopen_seqlock); tfom_copy[0] = tm->tcpm_fastopen; } while (read_seqretry(&fastopen_seqlock, seq)); tfom = tfom_copy; if (tfom->mss && nla_put_u16(msg, TCP_METRICS_ATTR_FOPEN_MSS, tfom->mss) < 0) goto nla_put_failure; if (tfom->syn_loss && (nla_put_u16(msg, TCP_METRICS_ATTR_FOPEN_SYN_DROPS, tfom->syn_loss) < 0 || nla_put_msecs(msg, TCP_METRICS_ATTR_FOPEN_SYN_DROP_TS, jiffies - tfom->last_syn_loss, TCP_METRICS_ATTR_PAD) < 0)) goto nla_put_failure; if (tfom->cookie.len > 0 && nla_put(msg, TCP_METRICS_ATTR_FOPEN_COOKIE, tfom->cookie.len, tfom->cookie.val) < 0) goto nla_put_failure; } return 0; nla_put_failure: return -EMSGSIZE; } static int tcp_metrics_dump_info(struct sk_buff *skb, struct netlink_callback *cb, struct tcp_metrics_block *tm) { void *hdr; hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &tcp_metrics_nl_family, NLM_F_MULTI, TCP_METRICS_CMD_GET); if (!hdr) return -EMSGSIZE; if (tcp_metrics_fill_info(skb, tm) < 0) goto nla_put_failure; genlmsg_end(skb, hdr); return 0; nla_put_failure: genlmsg_cancel(skb, hdr); return -EMSGSIZE; } static int tcp_metrics_nl_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); unsigned int max_rows = 1U << tcp_metrics_hash_log; unsigned int row, s_row = cb->args[0]; int s_col = cb->args[1], col = s_col; for (row = s_row; row < max_rows; row++, s_col = 0) { struct tcp_metrics_block *tm; struct tcpm_hash_bucket *hb = tcp_metrics_hash + row; rcu_read_lock(); for (col = 0, tm = rcu_dereference(hb->chain); tm; tm = rcu_dereference(tm->tcpm_next), col++) { if (!net_eq(tm_net(tm), net)) continue; if (col < s_col) continue; if (tcp_metrics_dump_info(skb, cb, tm) < 0) { rcu_read_unlock(); goto done; } } rcu_read_unlock(); } done: cb->args[0] = row; cb->args[1] = col; return skb->len; } static int __parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr, unsigned int *hash, int optional, int v4, int v6) { struct nlattr *a; a = info->attrs[v4]; if (a) { inetpeer_set_addr_v4(addr, nla_get_in_addr(a)); if (hash) *hash = ipv4_addr_hash(inetpeer_get_addr_v4(addr)); return 0; } a = info->attrs[v6]; if (a) { struct in6_addr in6; if (nla_len(a) != sizeof(struct in6_addr)) return -EINVAL; in6 = nla_get_in6_addr(a); inetpeer_set_addr_v6(addr, &in6); if (hash) *hash = ipv6_addr_hash(inetpeer_get_addr_v6(addr)); return 0; } return optional ? 1 : -EAFNOSUPPORT; } static int parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr, unsigned int *hash, int optional) { return __parse_nl_addr(info, addr, hash, optional, TCP_METRICS_ATTR_ADDR_IPV4, TCP_METRICS_ATTR_ADDR_IPV6); } static int parse_nl_saddr(struct genl_info *info, struct inetpeer_addr *addr) { return __parse_nl_addr(info, addr, NULL, 0, TCP_METRICS_ATTR_SADDR_IPV4, TCP_METRICS_ATTR_SADDR_IPV6); } static int tcp_metrics_nl_cmd_get(struct sk_buff *skb, struct genl_info *info) { struct tcp_metrics_block *tm; struct inetpeer_addr saddr, daddr; unsigned int hash; struct sk_buff *msg; struct net *net = genl_info_net(info); void *reply; int ret; bool src = true; ret = parse_nl_addr(info, &daddr, &hash, 0); if (ret < 0) return ret; ret = parse_nl_saddr(info, &saddr); if (ret < 0) src = false; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; reply = genlmsg_put_reply(msg, info, &tcp_metrics_nl_family, 0, info->genlhdr->cmd); if (!reply) goto nla_put_failure; hash ^= net_hash_mix(net); hash = hash_32(hash, tcp_metrics_hash_log); ret = -ESRCH; rcu_read_lock(); for (tm = rcu_dereference(tcp_metrics_hash[hash].chain); tm; tm = rcu_dereference(tm->tcpm_next)) { if (addr_same(&tm->tcpm_daddr, &daddr) && (!src || addr_same(&tm->tcpm_saddr, &saddr)) && net_eq(tm_net(tm), net)) { ret = tcp_metrics_fill_info(msg, tm); break; } } rcu_read_unlock(); if (ret < 0) goto out_free; genlmsg_end(msg, reply); return genlmsg_reply(msg, info); nla_put_failure: ret = -EMSGSIZE; out_free: nlmsg_free(msg); return ret; } static void tcp_metrics_flush_all(struct net *net) { unsigned int max_rows = 1U << tcp_metrics_hash_log; struct tcpm_hash_bucket *hb = tcp_metrics_hash; struct tcp_metrics_block *tm; unsigned int row; for (row = 0; row < max_rows; row++, hb++) { struct tcp_metrics_block __rcu **pp; bool match; spin_lock_bh(&tcp_metrics_lock); pp = &hb->chain; for (tm = deref_locked(*pp); tm; tm = deref_locked(*pp)) { match = net ? net_eq(tm_net(tm), net) : !refcount_read(&tm_net(tm)->ns.count); if (match) { rcu_assign_pointer(*pp, tm->tcpm_next); kfree_rcu(tm, rcu_head); } else { pp = &tm->tcpm_next; } } spin_unlock_bh(&tcp_metrics_lock); } } static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info) { struct tcpm_hash_bucket *hb; struct tcp_metrics_block *tm; struct tcp_metrics_block __rcu **pp; struct inetpeer_addr saddr, daddr; unsigned int hash; struct net *net = genl_info_net(info); int ret; bool src = true, found = false; ret = parse_nl_addr(info, &daddr, &hash, 1); if (ret < 0) return ret; if (ret > 0) { tcp_metrics_flush_all(net); return 0; } ret = parse_nl_saddr(info, &saddr); if (ret < 0) src = false; hash ^= net_hash_mix(net); hash = hash_32(hash, tcp_metrics_hash_log); hb = tcp_metrics_hash + hash; pp = &hb->chain; spin_lock_bh(&tcp_metrics_lock); for (tm = deref_locked(*pp); tm; tm = deref_locked(*pp)) { if (addr_same(&tm->tcpm_daddr, &daddr) && (!src || addr_same(&tm->tcpm_saddr, &saddr)) && net_eq(tm_net(tm), net)) { rcu_assign_pointer(*pp, tm->tcpm_next); kfree_rcu(tm, rcu_head); found = true; } else { pp = &tm->tcpm_next; } } spin_unlock_bh(&tcp_metrics_lock); if (!found) return -ESRCH; return 0; } static const struct genl_small_ops tcp_metrics_nl_ops[] = { { .cmd = TCP_METRICS_CMD_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = tcp_metrics_nl_cmd_get, .dumpit = tcp_metrics_nl_dump, }, { .cmd = TCP_METRICS_CMD_DEL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = tcp_metrics_nl_cmd_del, .flags = GENL_ADMIN_PERM, }, }; static struct genl_family tcp_metrics_nl_family __ro_after_init = { .hdrsize = 0, .name = TCP_METRICS_GENL_NAME, .version = TCP_METRICS_GENL_VERSION, .maxattr = TCP_METRICS_ATTR_MAX, .policy = tcp_metrics_nl_policy, .netnsok = true, .module = THIS_MODULE, .small_ops = tcp_metrics_nl_ops, .n_small_ops = ARRAY_SIZE(tcp_metrics_nl_ops), .resv_start_op = TCP_METRICS_CMD_DEL + 1, }; static unsigned int tcpmhash_entries; static int __init set_tcpmhash_entries(char *str) { ssize_t ret; if (!str) return 0; ret = kstrtouint(str, 0, &tcpmhash_entries); if (ret) return 0; return 1; } __setup("tcpmhash_entries=", set_tcpmhash_entries); static int __net_init tcp_net_metrics_init(struct net *net) { size_t size; unsigned int slots; if (!net_eq(net, &init_net)) return 0; slots = tcpmhash_entries; if (!slots) { if (totalram_pages() >= 128 * 1024) slots = 16 * 1024; else slots = 8 * 1024; } tcp_metrics_hash_log = order_base_2(slots); size = sizeof(struct tcpm_hash_bucket) << tcp_metrics_hash_log; tcp_metrics_hash = kvzalloc(size, GFP_KERNEL); if (!tcp_metrics_hash) return -ENOMEM; return 0; } static void __net_exit tcp_net_metrics_exit_batch(struct list_head *net_exit_list) { tcp_metrics_flush_all(NULL); } static __net_initdata struct pernet_operations tcp_net_metrics_ops = { .init = tcp_net_metrics_init, .exit_batch = tcp_net_metrics_exit_batch, }; void __init tcp_metrics_init(void) { int ret; ret = register_pernet_subsys(&tcp_net_metrics_ops); if (ret < 0) panic("Could not allocate the tcp_metrics hash table\n"); ret = genl_register_family(&tcp_metrics_nl_family); if (ret < 0) panic("Could not register tcp_metrics generic netlink\n"); } |
419 6 425 425 425 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 | // SPDX-License-Identifier: GPL-2.0-or-later /* 6LoWPAN fragment reassembly * * Authors: * Alexander Aring <aar@pengutronix.de> * * Based on: net/ipv6/reassembly.c */ #define pr_fmt(fmt) "6LoWPAN: " fmt #include <linux/net.h> #include <linux/list.h> #include <linux/netdevice.h> #include <linux/random.h> #include <linux/jhash.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/export.h> #include <net/ieee802154_netdev.h> #include <net/6lowpan.h> #include <net/ipv6_frag.h> #include <net/inet_frag.h> #include <net/ip.h> #include "6lowpan_i.h" static const char lowpan_frags_cache_name[] = "lowpan-frags"; static struct inet_frags lowpan_frags; static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *skb, struct sk_buff *prev, struct net_device *ldev); static void lowpan_frag_init(struct inet_frag_queue *q, const void *a) { const struct frag_lowpan_compare_key *key = a; BUILD_BUG_ON(sizeof(*key) > sizeof(q->key)); memcpy(&q->key, key, sizeof(*key)); } static void lowpan_frag_expire(struct timer_list *t) { struct inet_frag_queue *frag = from_timer(frag, t, timer); struct frag_queue *fq; fq = container_of(frag, struct frag_queue, q); spin_lock(&fq->q.lock); if (fq->q.flags & INET_FRAG_COMPLETE) goto out; inet_frag_kill(&fq->q); out: spin_unlock(&fq->q.lock); inet_frag_put(&fq->q); } static inline struct lowpan_frag_queue * fq_find(struct net *net, const struct lowpan_802154_cb *cb, const struct ieee802154_addr *src, const struct ieee802154_addr *dst) { struct netns_ieee802154_lowpan *ieee802154_lowpan = net_ieee802154_lowpan(net); struct frag_lowpan_compare_key key = {}; struct inet_frag_queue *q; key.tag = cb->d_tag; key.d_size = cb->d_size; key.src = *src; key.dst = *dst; q = inet_frag_find(ieee802154_lowpan->fqdir, &key); if (!q) return NULL; return container_of(q, struct lowpan_frag_queue, q); } static int lowpan_frag_queue(struct lowpan_frag_queue *fq, struct sk_buff *skb, u8 frag_type) { struct sk_buff *prev_tail; struct net_device *ldev; int end, offset, err; /* inet_frag_queue_* functions use skb->cb; see struct ipfrag_skb_cb * in inet_fragment.c */ BUILD_BUG_ON(sizeof(struct lowpan_802154_cb) > sizeof(struct inet_skb_parm)); BUILD_BUG_ON(sizeof(struct lowpan_802154_cb) > sizeof(struct inet6_skb_parm)); if (fq->q.flags & INET_FRAG_COMPLETE) goto err; offset = lowpan_802154_cb(skb)->d_offset << 3; end = lowpan_802154_cb(skb)->d_size; /* Is this the final fragment? */ if (offset + skb->len == end) { /* If we already have some bits beyond end * or have different end, the segment is corrupted. */ if (end < fq->q.len || ((fq->q.flags & INET_FRAG_LAST_IN) && end != fq->q.len)) goto err; fq->q.flags |= INET_FRAG_LAST_IN; fq->q.len = end; } else { if (end > fq->q.len) { /* Some bits beyond end -> corruption. */ if (fq->q.flags & INET_FRAG_LAST_IN) goto err; fq->q.len = end; } } ldev = skb->dev; if (ldev) skb->dev = NULL; barrier(); prev_tail = fq->q.fragments_tail; err = inet_frag_queue_insert(&fq->q, skb, offset, end); if (err) goto err; fq->q.stamp = skb->tstamp; fq->q.mono_delivery_time = skb->mono_delivery_time; if (frag_type == LOWPAN_DISPATCH_FRAG1) fq->q.flags |= INET_FRAG_FIRST_IN; fq->q.meat += skb->len; add_frag_mem_limit(fq->q.fqdir, skb->truesize); if (fq->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && fq->q.meat == fq->q.len) { int res; unsigned long orefdst = skb->_skb_refdst; skb->_skb_refdst = 0UL; res = lowpan_frag_reasm(fq, skb, prev_tail, ldev); skb->_skb_refdst = orefdst; return res; } skb_dst_drop(skb); return -1; err: kfree_skb(skb); return -1; } /* Check if this packet is complete. * * It is called with locked fq, and caller must check that * queue is eligible for reassembly i.e. it is not COMPLETE, * the last and the first frames arrived and all the bits are here. */ static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *skb, struct sk_buff *prev_tail, struct net_device *ldev) { void *reasm_data; inet_frag_kill(&fq->q); reasm_data = inet_frag_reasm_prepare(&fq->q, skb, prev_tail); if (!reasm_data) goto out_oom; inet_frag_reasm_finish(&fq->q, skb, reasm_data, false); skb->dev = ldev; skb->tstamp = fq->q.stamp; fq->q.rb_fragments = RB_ROOT; fq->q.fragments_tail = NULL; fq->q.last_run_head = NULL; return 1; out_oom: net_dbg_ratelimited("lowpan_frag_reasm: no memory for reassembly\n"); return -1; } static int lowpan_frag_rx_handlers_result(struct sk_buff *skb, lowpan_rx_result res) { switch (res) { case RX_QUEUED: return NET_RX_SUCCESS; case RX_CONTINUE: /* nobody cared about this packet */ net_warn_ratelimited("%s: received unknown dispatch\n", __func__); fallthrough; default: /* all others failure */ return NET_RX_DROP; } } static lowpan_rx_result lowpan_frag_rx_h_iphc(struct sk_buff *skb) { int ret; if (!lowpan_is_iphc(*skb_network_header(skb))) return RX_CONTINUE; ret = lowpan_iphc_decompress(skb); if (ret < 0) return RX_DROP; return RX_QUEUED; } static int lowpan_invoke_frag_rx_handlers(struct sk_buff *skb) { lowpan_rx_result res; #define CALL_RXH(rxh) \ do { \ res = rxh(skb); \ if (res != RX_CONTINUE) \ goto rxh_next; \ } while (0) /* likely at first */ CALL_RXH(lowpan_frag_rx_h_iphc); CALL_RXH(lowpan_rx_h_ipv6); rxh_next: return lowpan_frag_rx_handlers_result(skb, res); #undef CALL_RXH } #define LOWPAN_FRAG_DGRAM_SIZE_HIGH_MASK 0x07 #define LOWPAN_FRAG_DGRAM_SIZE_HIGH_SHIFT 8 static int lowpan_get_cb(struct sk_buff *skb, u8 frag_type, struct lowpan_802154_cb *cb) { bool fail; u8 high = 0, low = 0; __be16 d_tag = 0; fail = lowpan_fetch_skb(skb, &high, 1); fail |= lowpan_fetch_skb(skb, &low, 1); /* remove the dispatch value and use first three bits as high value * for the datagram size */ cb->d_size = (high & LOWPAN_FRAG_DGRAM_SIZE_HIGH_MASK) << LOWPAN_FRAG_DGRAM_SIZE_HIGH_SHIFT | low; fail |= lowpan_fetch_skb(skb, &d_tag, 2); cb->d_tag = ntohs(d_tag); if (frag_type == LOWPAN_DISPATCH_FRAGN) { fail |= lowpan_fetch_skb(skb, &cb->d_offset, 1); } else { skb_reset_network_header(skb); cb->d_offset = 0; /* check if datagram_size has ipv6hdr on FRAG1 */ fail |= cb->d_size < sizeof(struct ipv6hdr); /* check if we can dereference the dispatch value */ fail |= !skb->len; } if (unlikely(fail)) return -EIO; return 0; } int lowpan_frag_rcv(struct sk_buff *skb, u8 frag_type) { struct lowpan_frag_queue *fq; struct net *net = dev_net(skb->dev); struct lowpan_802154_cb *cb = lowpan_802154_cb(skb); struct ieee802154_hdr hdr = {}; int err; if (ieee802154_hdr_peek_addrs(skb, &hdr) < 0) goto err; err = lowpan_get_cb(skb, frag_type, cb); if (err < 0) goto err; if (frag_type == LOWPAN_DISPATCH_FRAG1) { err = lowpan_invoke_frag_rx_handlers(skb); if (err == NET_RX_DROP) goto err; } if (cb->d_size > IPV6_MIN_MTU) { net_warn_ratelimited("lowpan_frag_rcv: datagram size exceeds MTU\n"); goto err; } fq = fq_find(net, cb, &hdr.source, &hdr.dest); if (fq != NULL) { int ret; spin_lock(&fq->q.lock); ret = lowpan_frag_queue(fq, skb, frag_type); spin_unlock(&fq->q.lock); inet_frag_put(&fq->q); return ret; } err: kfree_skb(skb); return -1; } #ifdef CONFIG_SYSCTL static struct ctl_table lowpan_frags_ns_ctl_table[] = { { .procname = "6lowpanfrag_high_thresh", .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, { .procname = "6lowpanfrag_low_thresh", .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, { .procname = "6lowpanfrag_time", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { } }; /* secret interval has been deprecated */ static int lowpan_frags_secret_interval_unused; static struct ctl_table lowpan_frags_ctl_table[] = { { .procname = "6lowpanfrag_secret_interval", .data = &lowpan_frags_secret_interval_unused, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { } }; static int __net_init lowpan_frags_ns_sysctl_register(struct net *net) { struct ctl_table *table; struct ctl_table_header *hdr; struct netns_ieee802154_lowpan *ieee802154_lowpan = net_ieee802154_lowpan(net); table = lowpan_frags_ns_ctl_table; if (!net_eq(net, &init_net)) { table = kmemdup(table, sizeof(lowpan_frags_ns_ctl_table), GFP_KERNEL); if (table == NULL) goto err_alloc; /* Don't export sysctls to unprivileged users */ if (net->user_ns != &init_user_ns) table[0].procname = NULL; } table[0].data = &ieee802154_lowpan->fqdir->high_thresh; table[0].extra1 = &ieee802154_lowpan->fqdir->low_thresh; table[1].data = &ieee802154_lowpan->fqdir->low_thresh; table[1].extra2 = &ieee802154_lowpan->fqdir->high_thresh; table[2].data = &ieee802154_lowpan->fqdir->timeout; hdr = register_net_sysctl(net, "net/ieee802154/6lowpan", table); if (hdr == NULL) goto err_reg; ieee802154_lowpan->sysctl.frags_hdr = hdr; return 0; err_reg: if (!net_eq(net, &init_net)) kfree(table); err_alloc: return -ENOMEM; } static void __net_exit lowpan_frags_ns_sysctl_unregister(struct net *net) { struct ctl_table *table; struct netns_ieee802154_lowpan *ieee802154_lowpan = net_ieee802154_lowpan(net); table = ieee802154_lowpan->sysctl.frags_hdr->ctl_table_arg; unregister_net_sysctl_table(ieee802154_lowpan->sysctl.frags_hdr); if (!net_eq(net, &init_net)) kfree(table); } static struct ctl_table_header *lowpan_ctl_header; static int __init lowpan_frags_sysctl_register(void) { lowpan_ctl_header = register_net_sysctl(&init_net, "net/ieee802154/6lowpan", lowpan_frags_ctl_table); return lowpan_ctl_header == NULL ? -ENOMEM : 0; } static void lowpan_frags_sysctl_unregister(void) { unregister_net_sysctl_table(lowpan_ctl_header); } #else static inline int lowpan_frags_ns_sysctl_register(struct net *net) { return 0; } static inline void lowpan_frags_ns_sysctl_unregister(struct net *net) { } static inline int __init lowpan_frags_sysctl_register(void) { return 0; } static inline void lowpan_frags_sysctl_unregister(void) { } #endif static int __net_init lowpan_frags_init_net(struct net *net) { struct netns_ieee802154_lowpan *ieee802154_lowpan = net_ieee802154_lowpan(net); int res; res = fqdir_init(&ieee802154_lowpan->fqdir, &lowpan_frags, net); if (res < 0) return res; ieee802154_lowpan->fqdir->high_thresh = IPV6_FRAG_HIGH_THRESH; ieee802154_lowpan->fqdir->low_thresh = IPV6_FRAG_LOW_THRESH; ieee802154_lowpan->fqdir->timeout = IPV6_FRAG_TIMEOUT; res = lowpan_frags_ns_sysctl_register(net); if (res < 0) fqdir_exit(ieee802154_lowpan->fqdir); return res; } static void __net_exit lowpan_frags_pre_exit_net(struct net *net) { struct netns_ieee802154_lowpan *ieee802154_lowpan = net_ieee802154_lowpan(net); fqdir_pre_exit(ieee802154_lowpan->fqdir); } static void __net_exit lowpan_frags_exit_net(struct net *net) { struct netns_ieee802154_lowpan *ieee802154_lowpan = net_ieee802154_lowpan(net); lowpan_frags_ns_sysctl_unregister(net); fqdir_exit(ieee802154_lowpan->fqdir); } static struct pernet_operations lowpan_frags_ops = { .init = lowpan_frags_init_net, .pre_exit = lowpan_frags_pre_exit_net, .exit = lowpan_frags_exit_net, }; static u32 lowpan_key_hashfn(const void *data, u32 len, u32 seed) { return jhash2(data, sizeof(struct frag_lowpan_compare_key) / sizeof(u32), seed); } static u32 lowpan_obj_hashfn(const void *data, u32 len, u32 seed) { const struct inet_frag_queue *fq = data; return jhash2((const u32 *)&fq->key, sizeof(struct frag_lowpan_compare_key) / sizeof(u32), seed); } static int lowpan_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr) { const struct frag_lowpan_compare_key *key = arg->key; const struct inet_frag_queue *fq = ptr; return !!memcmp(&fq->key, key, sizeof(*key)); } static const struct rhashtable_params lowpan_rhash_params = { .head_offset = offsetof(struct inet_frag_queue, node), .hashfn = lowpan_key_hashfn, .obj_hashfn = lowpan_obj_hashfn, .obj_cmpfn = lowpan_obj_cmpfn, .automatic_shrinking = true, }; int __init lowpan_net_frag_init(void) { int ret; lowpan_frags.constructor = lowpan_frag_init; lowpan_frags.destructor = NULL; lowpan_frags.qsize = sizeof(struct frag_queue); lowpan_frags.frag_expire = lowpan_frag_expire; lowpan_frags.frags_cache_name = lowpan_frags_cache_name; lowpan_frags.rhash_params = lowpan_rhash_params; ret = inet_frags_init(&lowpan_frags); if (ret) goto out; ret = lowpan_frags_sysctl_register(); if (ret) goto err_sysctl; ret = register_pernet_subsys(&lowpan_frags_ops); if (ret) goto err_pernet; out: return ret; err_pernet: lowpan_frags_sysctl_unregister(); err_sysctl: inet_frags_fini(&lowpan_frags); return ret; } void lowpan_net_frag_exit(void) { lowpan_frags_sysctl_unregister(); unregister_pernet_subsys(&lowpan_frags_ops); inet_frags_fini(&lowpan_frags); } |
3313 41 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef INT_BLK_MQ_TAG_H #define INT_BLK_MQ_TAG_H struct blk_mq_alloc_data; extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node, int alloc_policy); extern void blk_mq_free_tags(struct blk_mq_tags *tags); extern int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags, struct sbitmap_queue *breserved_tags, unsigned int queue_depth, unsigned int reserved, int node, int alloc_policy); extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data); unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags, unsigned int *offset); extern void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx, unsigned int tag); void blk_mq_put_tags(struct blk_mq_tags *tags, int *tag_array, int nr_tags); extern int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, struct blk_mq_tags **tags, unsigned int depth, bool can_grow); extern void blk_mq_tag_resize_shared_tags(struct blk_mq_tag_set *set, unsigned int size); extern void blk_mq_tag_update_sched_shared_tags(struct request_queue *q); extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool); void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn, void *priv); void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn, void *priv); static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt, struct blk_mq_hw_ctx *hctx) { if (!hctx) return &bt->ws[0]; return sbq_wait_ptr(bt, &hctx->wait_index); } enum { BLK_MQ_NO_TAG = -1U, BLK_MQ_TAG_MIN = 1, BLK_MQ_TAG_MAX = BLK_MQ_NO_TAG - 1, }; extern void __blk_mq_tag_busy(struct blk_mq_hw_ctx *); extern void __blk_mq_tag_idle(struct blk_mq_hw_ctx *); static inline void blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) { if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) __blk_mq_tag_busy(hctx); } static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) { if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) return; __blk_mq_tag_idle(hctx); } static inline bool blk_mq_tag_is_reserved(struct blk_mq_tags *tags, unsigned int tag) { return tag < tags->nr_reserved_tags; } #endif |
1 1 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Cryptographic API. * * DES & Triple DES EDE Cipher Algorithms. * * Copyright (c) 2005 Dag Arne Osvik <da@osvik.no> */ #include <asm/byteorder.h> #include <linux/bitops.h> #include <linux/init.h> #include <linux/module.h> #include <linux/errno.h> #include <linux/crypto.h> #include <crypto/internal/des.h> static int des_setkey(struct crypto_tfm *tfm, const u8 *key, unsigned int keylen) { struct des_ctx *dctx = crypto_tfm_ctx(tfm); int err; err = des_expand_key(dctx, key, keylen); if (err == -ENOKEY) { if (crypto_tfm_get_flags(tfm) & CRYPTO_TFM_REQ_FORBID_WEAK_KEYS) err = -EINVAL; else err = 0; } if (err) memset(dctx, 0, sizeof(*dctx)); return err; } static void crypto_des_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { const struct des_ctx *dctx = crypto_tfm_ctx(tfm); des_encrypt(dctx, dst, src); } static void crypto_des_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { const struct des_ctx *dctx = crypto_tfm_ctx(tfm); des_decrypt(dctx, dst, src); } static int des3_ede_setkey(struct crypto_tfm *tfm, const u8 *key, unsigned int keylen) { struct des3_ede_ctx *dctx = crypto_tfm_ctx(tfm); int err; err = des3_ede_expand_key(dctx, key, keylen); if (err == -ENOKEY) { if (crypto_tfm_get_flags(tfm) & CRYPTO_TFM_REQ_FORBID_WEAK_KEYS) err = -EINVAL; else err = 0; } if (err) memset(dctx, 0, sizeof(*dctx)); return err; } static void crypto_des3_ede_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { const struct des3_ede_ctx *dctx = crypto_tfm_ctx(tfm); des3_ede_encrypt(dctx, dst, src); } static void crypto_des3_ede_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { const struct des3_ede_ctx *dctx = crypto_tfm_ctx(tfm); des3_ede_decrypt(dctx, dst, src); } static struct crypto_alg des_algs[2] = { { .cra_name = "des", .cra_driver_name = "des-generic", .cra_priority = 100, .cra_flags = CRYPTO_ALG_TYPE_CIPHER, .cra_blocksize = DES_BLOCK_SIZE, .cra_ctxsize = sizeof(struct des_ctx), .cra_module = THIS_MODULE, .cra_u = { .cipher = { .cia_min_keysize = DES_KEY_SIZE, .cia_max_keysize = DES_KEY_SIZE, .cia_setkey = des_setkey, .cia_encrypt = crypto_des_encrypt, .cia_decrypt = crypto_des_decrypt } } }, { .cra_name = "des3_ede", .cra_driver_name = "des3_ede-generic", .cra_priority = 100, .cra_flags = CRYPTO_ALG_TYPE_CIPHER, .cra_blocksize = DES3_EDE_BLOCK_SIZE, .cra_ctxsize = sizeof(struct des3_ede_ctx), .cra_module = THIS_MODULE, .cra_u = { .cipher = { .cia_min_keysize = DES3_EDE_KEY_SIZE, .cia_max_keysize = DES3_EDE_KEY_SIZE, .cia_setkey = des3_ede_setkey, .cia_encrypt = crypto_des3_ede_encrypt, .cia_decrypt = crypto_des3_ede_decrypt } } } }; static int __init des_generic_mod_init(void) { return crypto_register_algs(des_algs, ARRAY_SIZE(des_algs)); } static void __exit des_generic_mod_fini(void) { crypto_unregister_algs(des_algs, ARRAY_SIZE(des_algs)); } subsys_initcall(des_generic_mod_init); module_exit(des_generic_mod_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("DES & Triple DES EDE Cipher Algorithms"); MODULE_AUTHOR("Dag Arne Osvik <da@osvik.no>"); MODULE_ALIAS_CRYPTO("des"); MODULE_ALIAS_CRYPTO("des-generic"); MODULE_ALIAS_CRYPTO("des3_ede"); MODULE_ALIAS_CRYPTO("des3_ede-generic"); |
71 10 10 8 2 67 1 2 64 65 1 63 62 62 62 59 62 9 9 1 7 1 1 1 1 1 1 1 1 1 1 2 2 6 6 6 6 6 6 6 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 | // SPDX-License-Identifier: GPL-2.0-only /* * net/sched/sch_mq.c Classful multiqueue dummy scheduler * * Copyright (c) 2009 Patrick McHardy <kaber@trash.net> */ #include <linux/types.h> #include <linux/slab.h> #include <linux/kernel.h> #include <linux/export.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <net/netlink.h> #include <net/pkt_cls.h> #include <net/pkt_sched.h> #include <net/sch_generic.h> struct mq_sched { struct Qdisc **qdiscs; }; static int mq_offload(struct Qdisc *sch, enum tc_mq_command cmd) { struct net_device *dev = qdisc_dev(sch); struct tc_mq_qopt_offload opt = { .command = cmd, .handle = sch->handle, }; if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) return -EOPNOTSUPP; return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_MQ, &opt); } static int mq_offload_stats(struct Qdisc *sch) { struct tc_mq_qopt_offload opt = { .command = TC_MQ_STATS, .handle = sch->handle, .stats = { .bstats = &sch->bstats, .qstats = &sch->qstats, }, }; return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_MQ, &opt); } static void mq_destroy(struct Qdisc *sch) { struct net_device *dev = qdisc_dev(sch); struct mq_sched *priv = qdisc_priv(sch); unsigned int ntx; mq_offload(sch, TC_MQ_DESTROY); if (!priv->qdiscs) return; for (ntx = 0; ntx < dev->num_tx_queues && priv->qdiscs[ntx]; ntx++) qdisc_put(priv->qdiscs[ntx]); kfree(priv->qdiscs); } static int mq_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct net_device *dev = qdisc_dev(sch); struct mq_sched *priv = qdisc_priv(sch); struct netdev_queue *dev_queue; struct Qdisc *qdisc; unsigned int ntx; if (sch->parent != TC_H_ROOT) return -EOPNOTSUPP; if (!netif_is_multiqueue(dev)) return -EOPNOTSUPP; /* pre-allocate qdiscs, attachment can't fail */ priv->qdiscs = kcalloc(dev->num_tx_queues, sizeof(priv->qdiscs[0]), GFP_KERNEL); if (!priv->qdiscs) return -ENOMEM; for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { dev_queue = netdev_get_tx_queue(dev, ntx); qdisc = qdisc_create_dflt(dev_queue, get_default_qdisc_ops(dev, ntx), TC_H_MAKE(TC_H_MAJ(sch->handle), TC_H_MIN(ntx + 1)), extack); if (!qdisc) return -ENOMEM; priv->qdiscs[ntx] = qdisc; qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; } sch->flags |= TCQ_F_MQROOT; mq_offload(sch, TC_MQ_CREATE); return 0; } static void mq_attach(struct Qdisc *sch) { struct net_device *dev = qdisc_dev(sch); struct mq_sched *priv = qdisc_priv(sch); struct Qdisc *qdisc, *old; unsigned int ntx; for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { qdisc = priv->qdiscs[ntx]; old = dev_graft_qdisc(qdisc->dev_queue, qdisc); if (old) qdisc_put(old); #ifdef CONFIG_NET_SCHED if (ntx < dev->real_num_tx_queues) qdisc_hash_add(qdisc, false); #endif } kfree(priv->qdiscs); priv->qdiscs = NULL; } static int mq_dump(struct Qdisc *sch, struct sk_buff *skb) { struct net_device *dev = qdisc_dev(sch); struct Qdisc *qdisc; unsigned int ntx; sch->q.qlen = 0; gnet_stats_basic_sync_init(&sch->bstats); memset(&sch->qstats, 0, sizeof(sch->qstats)); /* MQ supports lockless qdiscs. However, statistics accounting needs * to account for all, none, or a mix of locked and unlocked child * qdiscs. Percpu stats are added to counters in-band and locking * qdisc totals are added at end. */ for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { qdisc = rtnl_dereference(netdev_get_tx_queue(dev, ntx)->qdisc_sleeping); spin_lock_bh(qdisc_lock(qdisc)); gnet_stats_add_basic(&sch->bstats, qdisc->cpu_bstats, &qdisc->bstats, false); gnet_stats_add_queue(&sch->qstats, qdisc->cpu_qstats, &qdisc->qstats); sch->q.qlen += qdisc_qlen(qdisc); spin_unlock_bh(qdisc_lock(qdisc)); } return mq_offload_stats(sch); } static struct netdev_queue *mq_queue_get(struct Qdisc *sch, unsigned long cl) { struct net_device *dev = qdisc_dev(sch); unsigned long ntx = cl - 1; if (ntx >= dev->num_tx_queues) return NULL; return netdev_get_tx_queue(dev, ntx); } static struct netdev_queue *mq_select_queue(struct Qdisc *sch, struct tcmsg *tcm) { return mq_queue_get(sch, TC_H_MIN(tcm->tcm_parent)); } static int mq_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, struct Qdisc **old, struct netlink_ext_ack *extack) { struct netdev_queue *dev_queue = mq_queue_get(sch, cl); struct tc_mq_qopt_offload graft_offload; struct net_device *dev = qdisc_dev(sch); if (dev->flags & IFF_UP) dev_deactivate(dev); *old = dev_graft_qdisc(dev_queue, new); if (new) new->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; if (dev->flags & IFF_UP) dev_activate(dev); graft_offload.handle = sch->handle; graft_offload.graft_params.queue = cl - 1; graft_offload.graft_params.child_handle = new ? new->handle : 0; graft_offload.command = TC_MQ_GRAFT; qdisc_offload_graft_helper(qdisc_dev(sch), sch, new, *old, TC_SETUP_QDISC_MQ, &graft_offload, extack); return 0; } static struct Qdisc *mq_leaf(struct Qdisc *sch, unsigned long cl) { struct netdev_queue *dev_queue = mq_queue_get(sch, cl); return rtnl_dereference(dev_queue->qdisc_sleeping); } static unsigned long mq_find(struct Qdisc *sch, u32 classid) { unsigned int ntx = TC_H_MIN(classid); if (!mq_queue_get(sch, ntx)) return 0; return ntx; } static int mq_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, struct tcmsg *tcm) { struct netdev_queue *dev_queue = mq_queue_get(sch, cl); tcm->tcm_parent = TC_H_ROOT; tcm->tcm_handle |= TC_H_MIN(cl); tcm->tcm_info = rtnl_dereference(dev_queue->qdisc_sleeping)->handle; return 0; } static int mq_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct gnet_dump *d) { struct netdev_queue *dev_queue = mq_queue_get(sch, cl); sch = rtnl_dereference(dev_queue->qdisc_sleeping); if (gnet_stats_copy_basic(d, sch->cpu_bstats, &sch->bstats, true) < 0 || qdisc_qstats_copy(d, sch) < 0) return -1; return 0; } static void mq_walk(struct Qdisc *sch, struct qdisc_walker *arg) { struct net_device *dev = qdisc_dev(sch); unsigned int ntx; if (arg->stop) return; arg->count = arg->skip; for (ntx = arg->skip; ntx < dev->num_tx_queues; ntx++) { if (!tc_qdisc_stats_dump(sch, ntx + 1, arg)) break; } } static const struct Qdisc_class_ops mq_class_ops = { .select_queue = mq_select_queue, .graft = mq_graft, .leaf = mq_leaf, .find = mq_find, .walk = mq_walk, .dump = mq_dump_class, .dump_stats = mq_dump_class_stats, }; struct Qdisc_ops mq_qdisc_ops __read_mostly = { .cl_ops = &mq_class_ops, .id = "mq", .priv_size = sizeof(struct mq_sched), .init = mq_init, .destroy = mq_destroy, .attach = mq_attach, .change_real_num_tx = mq_change_real_num_tx, .dump = mq_dump, .owner = THIS_MODULE, }; |
184 256 257 256 9 45 173 20 11 75 60 6 4 54 5 53 2 37 77 77 77 44 58 6 42 21 1 9 9 6 1 5 1 1 8 32 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 | // SPDX-License-Identifier: GPL-2.0-only /* * IPv6 library code, needed by static components when full IPv6 support is * not configured or static. */ #include <linux/export.h> #include <net/ipv6.h> /* * find out if nexthdr is a well-known extension header or a protocol */ bool ipv6_ext_hdr(u8 nexthdr) { /* * find out if nexthdr is an extension header or a protocol */ return (nexthdr == NEXTHDR_HOP) || (nexthdr == NEXTHDR_ROUTING) || (nexthdr == NEXTHDR_FRAGMENT) || (nexthdr == NEXTHDR_AUTH) || (nexthdr == NEXTHDR_NONE) || (nexthdr == NEXTHDR_DEST); } EXPORT_SYMBOL(ipv6_ext_hdr); /* * Skip any extension headers. This is used by the ICMP module. * * Note that strictly speaking this conflicts with RFC 2460 4.0: * ...The contents and semantics of each extension header determine whether * or not to proceed to the next header. Therefore, extension headers must * be processed strictly in the order they appear in the packet; a * receiver must not, for example, scan through a packet looking for a * particular kind of extension header and process that header prior to * processing all preceding ones. * * We do exactly this. This is a protocol bug. We can't decide after a * seeing an unknown discard-with-error flavour TLV option if it's a * ICMP error message or not (errors should never be send in reply to * ICMP error messages). * * But I see no other way to do this. This might need to be reexamined * when Linux implements ESP (and maybe AUTH) headers. * --AK * * This function parses (probably truncated) exthdr set "hdr". * "nexthdrp" initially points to some place, * where type of the first header can be found. * * It skips all well-known exthdrs, and returns pointer to the start * of unparsable area i.e. the first header with unknown type. * If it is not NULL *nexthdr is updated by type/protocol of this header. * * NOTES: - if packet terminated with NEXTHDR_NONE it returns NULL. * - it may return pointer pointing beyond end of packet, * if the last recognized header is truncated in the middle. * - if packet is truncated, so that all parsed headers are skipped, * it returns NULL. * - First fragment header is skipped, not-first ones * are considered as unparsable. * - Reports the offset field of the final fragment header so it is * possible to tell whether this is a first fragment, later fragment, * or not fragmented. * - ESP is unparsable for now and considered like * normal payload protocol. * - Note also special handling of AUTH header. Thanks to IPsec wizards. * * --ANK (980726) */ int ipv6_skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp, __be16 *frag_offp) { u8 nexthdr = *nexthdrp; *frag_offp = 0; while (ipv6_ext_hdr(nexthdr)) { struct ipv6_opt_hdr _hdr, *hp; int hdrlen; if (nexthdr == NEXTHDR_NONE) return -1; hp = skb_header_pointer(skb, start, sizeof(_hdr), &_hdr); if (!hp) return -1; if (nexthdr == NEXTHDR_FRAGMENT) { __be16 _frag_off, *fp; fp = skb_header_pointer(skb, start+offsetof(struct frag_hdr, frag_off), sizeof(_frag_off), &_frag_off); if (!fp) return -1; *frag_offp = *fp; if (ntohs(*frag_offp) & ~0x7) break; hdrlen = 8; } else if (nexthdr == NEXTHDR_AUTH) hdrlen = ipv6_authlen(hp); else hdrlen = ipv6_optlen(hp); nexthdr = hp->nexthdr; start += hdrlen; } *nexthdrp = nexthdr; return start; } EXPORT_SYMBOL(ipv6_skip_exthdr); int ipv6_find_tlv(const struct sk_buff *skb, int offset, int type) { const unsigned char *nh = skb_network_header(skb); int packet_len = skb_tail_pointer(skb) - skb_network_header(skb); struct ipv6_opt_hdr *hdr; int len; if (offset + 2 > packet_len) goto bad; hdr = (struct ipv6_opt_hdr *)(nh + offset); len = ((hdr->hdrlen + 1) << 3); if (offset + len > packet_len) goto bad; offset += 2; len -= 2; while (len > 0) { int opttype = nh[offset]; int optlen; if (opttype == type) return offset; switch (opttype) { case IPV6_TLV_PAD1: optlen = 1; break; default: if (len < 2) goto bad; optlen = nh[offset + 1] + 2; if (optlen > len) goto bad; break; } offset += optlen; len -= optlen; } /* not_found */ bad: return -1; } EXPORT_SYMBOL_GPL(ipv6_find_tlv); /* * find the offset to specified header or the protocol number of last header * if target < 0. "last header" is transport protocol header, ESP, or * "No next header". * * Note that *offset is used as input/output parameter, and if it is not zero, * then it must be a valid offset to an inner IPv6 header. This can be used * to explore inner IPv6 header, eg. ICMPv6 error messages. * * If target header is found, its offset is set in *offset and return protocol * number. Otherwise, return -1. * * If the first fragment doesn't contain the final protocol header or * NEXTHDR_NONE it is considered invalid. * * Note that non-1st fragment is special case that "the protocol number * of last header" is "next header" field in Fragment header. In this case, * *offset is meaningless and fragment offset is stored in *fragoff if fragoff * isn't NULL. * * if flags is not NULL and it's a fragment, then the frag flag * IP6_FH_F_FRAG will be set. If it's an AH header, the * IP6_FH_F_AUTH flag is set and target < 0, then this function will * stop at the AH header. If IP6_FH_F_SKIP_RH flag was passed, then this * function will skip all those routing headers, where segements_left was 0. */ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, int target, unsigned short *fragoff, int *flags) { unsigned int start = skb_network_offset(skb) + sizeof(struct ipv6hdr); u8 nexthdr = ipv6_hdr(skb)->nexthdr; bool found; if (fragoff) *fragoff = 0; if (*offset) { struct ipv6hdr _ip6, *ip6; ip6 = skb_header_pointer(skb, *offset, sizeof(_ip6), &_ip6); if (!ip6 || (ip6->version != 6)) return -EBADMSG; start = *offset + sizeof(struct ipv6hdr); nexthdr = ip6->nexthdr; } do { struct ipv6_opt_hdr _hdr, *hp; unsigned int hdrlen; found = (nexthdr == target); if ((!ipv6_ext_hdr(nexthdr)) || nexthdr == NEXTHDR_NONE) { if (target < 0 || found) break; return -ENOENT; } hp = skb_header_pointer(skb, start, sizeof(_hdr), &_hdr); if (!hp) return -EBADMSG; if (nexthdr == NEXTHDR_ROUTING) { struct ipv6_rt_hdr _rh, *rh; rh = skb_header_pointer(skb, start, sizeof(_rh), &_rh); if (!rh) return -EBADMSG; if (flags && (*flags & IP6_FH_F_SKIP_RH) && rh->segments_left == 0) found = false; } if (nexthdr == NEXTHDR_FRAGMENT) { unsigned short _frag_off; __be16 *fp; if (flags) /* Indicate that this is a fragment */ *flags |= IP6_FH_F_FRAG; fp = skb_header_pointer(skb, start+offsetof(struct frag_hdr, frag_off), sizeof(_frag_off), &_frag_off); if (!fp) return -EBADMSG; _frag_off = ntohs(*fp) & ~0x7; if (_frag_off) { if (target < 0 && ((!ipv6_ext_hdr(hp->nexthdr)) || hp->nexthdr == NEXTHDR_NONE)) { if (fragoff) *fragoff = _frag_off; return hp->nexthdr; } if (!found) return -ENOENT; if (fragoff) *fragoff = _frag_off; break; } hdrlen = 8; } else if (nexthdr == NEXTHDR_AUTH) { if (flags && (*flags & IP6_FH_F_AUTH) && (target < 0)) break; hdrlen = ipv6_authlen(hp); } else hdrlen = ipv6_optlen(hp); if (!found) { nexthdr = hp->nexthdr; start += hdrlen; } } while (!found); *offset = start; return nexthdr; } EXPORT_SYMBOL(ipv6_find_hdr); |
2651 2650 2651 516 513 513 514 514 453 143 47 516 20 514 516 516 516 241 241 241 241 2651 2624 2648 2648 2649 2647 2649 2651 2649 2648 2651 2649 2651 190 2588 2651 2650 2649 2650 2651 2649 2651 536 2623 242 515 2650 2647 537 2651 2646 537 2651 516 2651 515 2 2651 2649 2651 2651 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 | // SPDX-License-Identifier: GPL-2.0 /* * Generic ring buffer * * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com> */ #include <linux/trace_recursion.h> #include <linux/ring_buffer_ext.h> #include <linux/trace_events.h> #include <linux/ring_buffer.h> #include <linux/trace_clock.h> #include <linux/sched/clock.h> #include <linux/trace_seq.h> #include <linux/spinlock.h> #include <linux/irq_work.h> #include <linux/security.h> #include <linux/uaccess.h> #include <linux/hardirq.h> #include <linux/kthread.h> /* for self test */ #include <linux/module.h> #include <linux/percpu.h> #include <linux/mutex.h> #include <linux/delay.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/hash.h> #include <linux/list.h> #include <linux/cpu.h> #include <linux/oom.h> #include <asm/local.h> /* * The "absolute" timestamp in the buffer is only 59 bits. * If a clock has the 5 MSBs set, it needs to be saved and * reinserted. */ #define TS_MSB (0xf8ULL << 56) #define ABS_TS_MASK (~TS_MSB) static void update_pages_handler(struct work_struct *work); /* * The ring buffer header is special. We must manually up keep it. */ int ring_buffer_print_entry_header(struct trace_seq *s) { trace_seq_puts(s, "# compressed entry header\n"); trace_seq_puts(s, "\ttype_len : 5 bits\n"); trace_seq_puts(s, "\ttime_delta : 27 bits\n"); trace_seq_puts(s, "\tarray : 32 bits\n"); trace_seq_putc(s, '\n'); trace_seq_printf(s, "\tpadding : type == %d\n", RINGBUF_TYPE_PADDING); trace_seq_printf(s, "\ttime_extend : type == %d\n", RINGBUF_TYPE_TIME_EXTEND); trace_seq_printf(s, "\ttime_stamp : type == %d\n", RINGBUF_TYPE_TIME_STAMP); trace_seq_printf(s, "\tdata max type_len == %d\n", RINGBUF_TYPE_DATA_TYPE_LEN_MAX); return !trace_seq_has_overflowed(s); } /* * The ring buffer is made up of a list of pages. A separate list of pages is * allocated for each CPU. A writer may only write to a buffer that is * associated with the CPU it is currently executing on. A reader may read * from any per cpu buffer. * * The reader is special. For each per cpu buffer, the reader has its own * reader page. When a reader has read the entire reader page, this reader * page is swapped with another page in the ring buffer. * * Now, as long as the writer is off the reader page, the reader can do what * ever it wants with that page. The writer will never write to that page * again (as long as it is out of the ring buffer). * * Here's some silly ASCII art. * * +------+ * |reader| RING BUFFER * |page | * +------+ +---+ +---+ +---+ * | |-->| |-->| | * +---+ +---+ +---+ * ^ | * | | * +---------------+ * * * +------+ * |reader| RING BUFFER * |page |------------------v * +------+ +---+ +---+ +---+ * | |-->| |-->| | * +---+ +---+ +---+ * ^ | * | | * +---------------+ * * * +------+ * |reader| RING BUFFER * |page |------------------v * +------+ +---+ +---+ +---+ * ^ | |-->| |-->| | * | +---+ +---+ +---+ * | | * | | * +------------------------------+ * * * +------+ * |buffer| RING BUFFER * |page |------------------v * +------+ +---+ +---+ +---+ * ^ | | | |-->| | * | New +---+ +---+ +---+ * | Reader------^ | * | page | * +------------------------------+ * * * After we make this swap, the reader can hand this page off to the splice * code and be done with it. It can even allocate a new page if it needs to * and swap that into the ring buffer. * * We will be using cmpxchg soon to make all this lockless. * */ /* Used for individual buffers (after the counter) */ #define RB_BUFFER_OFF (1 << 20) /* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ #define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX enum { RB_LEN_TIME_EXTEND = 8, RB_LEN_TIME_STAMP = 8, }; #define skip_time_extend(event) \ ((struct ring_buffer_event *)((char *)event + RB_LEN_TIME_EXTEND)) #define extended_time(event) \ (event->type_len >= RINGBUF_TYPE_TIME_EXTEND) static inline int rb_null_event(struct ring_buffer_event *event) { return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta; } static void rb_event_set_padding(struct ring_buffer_event *event) { /* padding has a NULL time_delta */ event->type_len = RINGBUF_TYPE_PADDING; event->time_delta = 0; } static unsigned rb_event_data_length(struct ring_buffer_event *event) { unsigned length; if (event->type_len) length = event->type_len * RB_ALIGNMENT; else length = event->array[0]; return length + RB_EVNT_HDR_SIZE; } /* * Return the length of the given event. Will return * the length of the time extend if the event is a * time extend. */ static inline unsigned rb_event_length(struct ring_buffer_event *event) { switch (event->type_len) { case RINGBUF_TYPE_PADDING: if (rb_null_event(event)) /* undefined */ return -1; return event->array[0] + RB_EVNT_HDR_SIZE; case RINGBUF_TYPE_TIME_EXTEND: return RB_LEN_TIME_EXTEND; case RINGBUF_TYPE_TIME_STAMP: return RB_LEN_TIME_STAMP; case RINGBUF_TYPE_DATA: return rb_event_data_length(event); default: WARN_ON_ONCE(1); } /* not hit */ return 0; } /* * Return total length of time extend and data, * or just the event length for all other events. */ static inline unsigned rb_event_ts_length(struct ring_buffer_event *event) { unsigned len = 0; if (extended_time(event)) { /* time extends include the data event after it */ len = RB_LEN_TIME_EXTEND; event = skip_time_extend(event); } return len + rb_event_length(event); } /** * ring_buffer_event_length - return the length of the event * @event: the event to get the length of * * Returns the size of the data load of a data event. * If the event is something other than a data event, it * returns the size of the event itself. With the exception * of a TIME EXTEND, where it still returns the size of the * data load of the data event after it. */ unsigned ring_buffer_event_length(struct ring_buffer_event *event) { unsigned length; if (extended_time(event)) event = skip_time_extend(event); length = rb_event_length(event); if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) return length; length -= RB_EVNT_HDR_SIZE; if (length > RB_MAX_SMALL_DATA + sizeof(event->array[0])) length -= sizeof(event->array[0]); return length; } EXPORT_SYMBOL_GPL(ring_buffer_event_length); /* inline for ring buffer fast paths */ static __always_inline void * rb_event_data(struct ring_buffer_event *event) { if (extended_time(event)) event = skip_time_extend(event); WARN_ON_ONCE(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX); /* If length is in len field, then array[0] has the data */ if (event->type_len) return (void *)&event->array[0]; /* Otherwise length is in array[0] and array[1] has the data */ return (void *)&event->array[1]; } /** * ring_buffer_event_data - return the data of the event * @event: the event to get the data from */ void *ring_buffer_event_data(struct ring_buffer_event *event) { return rb_event_data(event); } EXPORT_SYMBOL_GPL(ring_buffer_event_data); #define for_each_buffer_cpu(buffer, cpu) \ for_each_cpu(cpu, buffer->cpumask) #define for_each_online_buffer_cpu(buffer, cpu) \ for_each_cpu_and(cpu, buffer->cpumask, cpu_online_mask) static u64 rb_event_time_stamp(struct ring_buffer_event *event) { u64 ts; ts = event->array[0]; ts <<= TS_SHIFT; ts += event->time_delta; return ts; } /* Flag when events were overwritten */ #define RB_MISSED_EVENTS (1 << 31) /* Missed count stored at end */ #define RB_MISSED_STORED (1 << 30) /* * Note, the buffer_page list must be first. The buffer pages * are allocated in cache lines, which means that each buffer * page will be at the beginning of a cache line, and thus * the least significant bits will be zero. We use this to * add flags in the list struct pointers, to make the ring buffer * lockless. */ struct buffer_page { struct list_head list; /* list of buffer pages */ local_t write; /* index for next write */ unsigned read; /* index for next read */ local_t entries; /* entries on this page */ unsigned long real_end; /* real end of data */ struct buffer_data_page *page; /* Actual data page */ }; /* * The buffer page counters, write and entries, must be reset * atomically when crossing page boundaries. To synchronize this * update, two counters are inserted into the number. One is * the actual counter for the write position or count on the page. * * The other is a counter of updaters. Before an update happens * the update partition of the counter is incremented. This will * allow the updater to update the counter atomically. * * The counter is 20 bits, and the state data is 12. */ #define RB_WRITE_MASK 0xfffff #define RB_WRITE_INTCNT (1 << 20) static void rb_init_page(struct buffer_data_page *bpage) { local_set(&bpage->commit, 0); } static __always_inline unsigned int rb_page_commit(struct buffer_page *bpage) { return local_read(&bpage->page->commit); } static void free_buffer_page(struct buffer_page *bpage) { free_page((unsigned long)bpage->page); kfree(bpage); } /* Max payload is BUF_PAGE_SIZE - header (8bytes) */ #define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2)) int ring_buffer_print_page_header(struct trace_seq *s) { struct buffer_data_page field; trace_seq_printf(s, "\tfield: u64 timestamp;\t" "offset:0;\tsize:%u;\tsigned:%u;\n", (unsigned int)sizeof(field.time_stamp), (unsigned int)is_signed_type(u64)); trace_seq_printf(s, "\tfield: local_t commit;\t" "offset:%u;\tsize:%u;\tsigned:%u;\n", (unsigned int)offsetof(typeof(field), commit), (unsigned int)sizeof(field.commit), (unsigned int)is_signed_type(long)); trace_seq_printf(s, "\tfield: int overwrite;\t" "offset:%u;\tsize:%u;\tsigned:%u;\n", (unsigned int)offsetof(typeof(field), commit), 1, (unsigned int)is_signed_type(long)); trace_seq_printf(s, "\tfield: char data;\t" "offset:%u;\tsize:%u;\tsigned:%u;\n", (unsigned int)offsetof(typeof(field), data), (unsigned int)BUF_PAGE_SIZE, (unsigned int)is_signed_type(char)); return !trace_seq_has_overflowed(s); } struct rb_irq_work { struct irq_work work; wait_queue_head_t waiters; wait_queue_head_t full_waiters; bool waiters_pending; bool full_waiters_pending; bool wakeup_full; }; /* * Structure to hold event state and handle nested events. */ struct rb_event_info { u64 ts; u64 delta; u64 before; u64 after; unsigned long length; struct buffer_page *tail_page; int add_timestamp; }; /* * Used for the add_timestamp * NONE * EXTEND - wants a time extend * ABSOLUTE - the buffer requests all events to have absolute time stamps * FORCE - force a full time stamp. */ enum { RB_ADD_STAMP_NONE = 0, RB_ADD_STAMP_EXTEND = BIT(1), RB_ADD_STAMP_ABSOLUTE = BIT(2), RB_ADD_STAMP_FORCE = BIT(3) }; /* * Used for which event context the event is in. * TRANSITION = 0 * NMI = 1 * IRQ = 2 * SOFTIRQ = 3 * NORMAL = 4 * * See trace_recursive_lock() comment below for more details. */ enum { RB_CTX_TRANSITION, RB_CTX_NMI, RB_CTX_IRQ, RB_CTX_SOFTIRQ, RB_CTX_NORMAL, RB_CTX_MAX }; #if BITS_PER_LONG == 32 #define RB_TIME_32 #endif /* To test on 64 bit machines */ //#define RB_TIME_32 #ifdef RB_TIME_32 struct rb_time_struct { local_t cnt; local_t top; local_t bottom; local_t msb; }; #else #include <asm/local64.h> struct rb_time_struct { local64_t time; }; #endif typedef struct rb_time_struct rb_time_t; #define MAX_NEST 5 /* * head_page == tail_page && head == tail then buffer is empty. */ struct ring_buffer_per_cpu { int cpu; atomic_t record_disabled; atomic_t resize_disabled; struct trace_buffer *buffer; raw_spinlock_t reader_lock; /* serialize readers */ arch_spinlock_t lock; struct lock_class_key lock_key; struct buffer_data_page *free_page; unsigned long nr_pages; unsigned int current_context; struct list_head *pages; struct buffer_page *head_page; /* read from head */ struct buffer_page *tail_page; /* write to tail */ struct buffer_page *commit_page; /* committed pages */ struct buffer_page *reader_page; unsigned long lost_events; unsigned long last_overrun; unsigned long nest; local_t entries_bytes; local_t entries; local_t overrun; local_t commit_overrun; local_t dropped_events; local_t committing; local_t commits; local_t pages_touched; local_t pages_lost; local_t pages_read; long last_pages_touch; size_t shortest_full; unsigned long read; unsigned long read_bytes; rb_time_t write_stamp; rb_time_t before_stamp; u64 event_stamp[MAX_NEST]; u64 read_stamp; /* ring buffer pages to update, > 0 to add, < 0 to remove */ long nr_pages_to_update; struct list_head new_pages; /* new pages to add */ struct work_struct update_pages_work; struct completion update_done; struct rb_irq_work irq_work; }; struct trace_buffer { unsigned flags; int cpus; atomic_t record_disabled; atomic_t resizing; cpumask_var_t cpumask; struct lock_class_key *reader_lock_key; struct mutex mutex; struct ring_buffer_per_cpu **buffers; struct hlist_node node; u64 (*clock)(void); struct rb_irq_work irq_work; bool time_stamp_abs; struct ring_buffer_ext_cb *ext_cb; }; struct ring_buffer_iter { struct ring_buffer_per_cpu *cpu_buffer; unsigned long head; unsigned long next_event; struct buffer_page *head_page; struct buffer_page *cache_reader_page; unsigned long cache_read; u64 read_stamp; u64 page_stamp; struct ring_buffer_event *event; int missed_events; }; #ifdef RB_TIME_32 /* * On 32 bit machines, local64_t is very expensive. As the ring * buffer doesn't need all the features of a true 64 bit atomic, * on 32 bit, it uses these functions (64 still uses local64_t). * * For the ring buffer, 64 bit required operations for the time is * the following: * * - Reads may fail if it interrupted a modification of the time stamp. * It will succeed if it did not interrupt another write even if * the read itself is interrupted by a write. * It returns whether it was successful or not. * * - Writes always succeed and will overwrite other writes and writes * that were done by events interrupting the current write. * * - A write followed by a read of the same time stamp will always succeed, * but may not contain the same value. * * - A cmpxchg will fail if it interrupted another write or cmpxchg. * Other than that, it acts like a normal cmpxchg. * * The 60 bit time stamp is broken up by 30 bits in a top and bottom half * (bottom being the least significant 30 bits of the 60 bit time stamp). * * The two most significant bits of each half holds a 2 bit counter (0-3). * Each update will increment this counter by one. * When reading the top and bottom, if the two counter bits match then the * top and bottom together make a valid 60 bit number. */ #define RB_TIME_SHIFT 30 #define RB_TIME_VAL_MASK ((1 << RB_TIME_SHIFT) - 1) #define RB_TIME_MSB_SHIFT 60 static inline int rb_time_cnt(unsigned long val) { return (val >> RB_TIME_SHIFT) & 3; } static inline u64 rb_time_val(unsigned long top, unsigned long bottom) { u64 val; val = top & RB_TIME_VAL_MASK; val <<= RB_TIME_SHIFT; val |= bottom & RB_TIME_VAL_MASK; return val; } static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt) { unsigned long top, bottom, msb; unsigned long c; /* * If the read is interrupted by a write, then the cnt will * be different. Loop until both top and bottom have been read * without interruption. */ do { c = local_read(&t->cnt); top = local_read(&t->top); bottom = local_read(&t->bottom); msb = local_read(&t->msb); } while (c != local_read(&t->cnt)); *cnt = rb_time_cnt(top); /* If top, msb or bottom counts don't match, this interrupted a write */ if (*cnt != rb_time_cnt(msb) || *cnt != rb_time_cnt(bottom)) return false; /* The shift to msb will lose its cnt bits */ *ret = rb_time_val(top, bottom) | ((u64)msb << RB_TIME_MSB_SHIFT); return true; } static bool rb_time_read(rb_time_t *t, u64 *ret) { unsigned long cnt; return __rb_time_read(t, ret, &cnt); } static inline unsigned long rb_time_val_cnt(unsigned long val, unsigned long cnt) { return (val & RB_TIME_VAL_MASK) | ((cnt & 3) << RB_TIME_SHIFT); } static inline void rb_time_split(u64 val, unsigned long *top, unsigned long *bottom, unsigned long *msb) { *top = (unsigned long)((val >> RB_TIME_SHIFT) & RB_TIME_VAL_MASK); *bottom = (unsigned long)(val & RB_TIME_VAL_MASK); *msb = (unsigned long)(val >> RB_TIME_MSB_SHIFT); } static inline void rb_time_val_set(local_t *t, unsigned long val, unsigned long cnt) { val = rb_time_val_cnt(val, cnt); local_set(t, val); } static void rb_time_set(rb_time_t *t, u64 val) { unsigned long cnt, top, bottom, msb; rb_time_split(val, &top, &bottom, &msb); /* Writes always succeed with a valid number even if it gets interrupted. */ do { cnt = local_inc_return(&t->cnt); rb_time_val_set(&t->top, top, cnt); rb_time_val_set(&t->bottom, bottom, cnt); rb_time_val_set(&t->msb, val >> RB_TIME_MSB_SHIFT, cnt); } while (cnt != local_read(&t->cnt)); } static inline bool rb_time_read_cmpxchg(local_t *l, unsigned long expect, unsigned long set) { unsigned long ret; ret = local_cmpxchg(l, expect, set); return ret == expect; } #else /* 64 bits */ /* local64_t always succeeds */ static inline bool rb_time_read(rb_time_t *t, u64 *ret) { *ret = local64_read(&t->time); return true; } static void rb_time_set(rb_time_t *t, u64 val) { local64_set(&t->time, val); } #endif static inline bool has_ext_writer(struct trace_buffer *buffer) { return !!buffer->ext_cb; } static inline bool rb_has_ext_writer(struct ring_buffer_per_cpu *cpu_buffer) { return has_ext_writer(cpu_buffer->buffer); } /* * Enable this to make sure that the event passed to * ring_buffer_event_time_stamp() is not committed and also * is on the buffer that it passed in. */ //#define RB_VERIFY_EVENT #ifdef RB_VERIFY_EVENT static struct list_head *rb_list_head(struct list_head *list); static void verify_event(struct ring_buffer_per_cpu *cpu_buffer, void *event) { struct buffer_page *page = cpu_buffer->commit_page; struct buffer_page *tail_page = READ_ONCE(cpu_buffer->tail_page); struct list_head *next; long commit, write; unsigned long addr = (unsigned long)event; bool done = false; int stop = 0; /* Make sure the event exists and is not committed yet */ do { if (page == tail_page || WARN_ON_ONCE(stop++ > 100)) done = true; commit = local_read(&page->page->commit); write = local_read(&page->write); if (addr >= (unsigned long)&page->page->data[commit] && addr < (unsigned long)&page->page->data[write]) return; next = rb_list_head(page->list.next); page = list_entry(next, struct buffer_page, list); } while (!done); WARN_ON_ONCE(1); } #else static inline void verify_event(struct ring_buffer_per_cpu *cpu_buffer, void *event) { } #endif /* * The absolute time stamp drops the 5 MSBs and some clocks may * require them. The rb_fix_abs_ts() will take a previous full * time stamp, and add the 5 MSB of that time stamp on to the * saved absolute time stamp. Then they are compared in case of * the unlikely event that the latest time stamp incremented * the 5 MSB. */ static inline u64 rb_fix_abs_ts(u64 abs, u64 save_ts) { if (save_ts & TS_MSB) { abs |= save_ts & TS_MSB; /* Check for overflow */ if (unlikely(abs < save_ts)) abs += 1ULL << 59; } return abs; } static inline u64 rb_time_stamp(struct trace_buffer *buffer); /** * ring_buffer_event_time_stamp - return the event's current time stamp * @buffer: The buffer that the event is on * @event: the event to get the time stamp of * * Note, this must be called after @event is reserved, and before it is * committed to the ring buffer. And must be called from the same * context where the event was reserved (normal, softirq, irq, etc). * * Returns the time stamp associated with the current event. * If the event has an extended time stamp, then that is used as * the time stamp to return. * In the highly unlikely case that the event was nested more than * the max nesting, then the write_stamp of the buffer is returned, * otherwise current time is returned, but that really neither of * the last two cases should ever happen. */ u64 ring_buffer_event_time_stamp(struct trace_buffer *buffer, struct ring_buffer_event *event) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[smp_processor_id()]; unsigned int nest; u64 ts; /* If the event includes an absolute time, then just use that */ if (event->type_len == RINGBUF_TYPE_TIME_STAMP) { ts = rb_event_time_stamp(event); return rb_fix_abs_ts(ts, cpu_buffer->tail_page->page->time_stamp); } nest = local_read(&cpu_buffer->committing); verify_event(cpu_buffer, event); if (WARN_ON_ONCE(!nest)) goto fail; /* Read the current saved nesting level time stamp */ if (likely(--nest < MAX_NEST)) return cpu_buffer->event_stamp[nest]; /* Shouldn't happen, warn if it does */ WARN_ONCE(1, "nest (%d) greater than max", nest); fail: /* Can only fail on 32 bit */ if (!rb_time_read(&cpu_buffer->write_stamp, &ts)) /* Screw it, just read the current time */ ts = rb_time_stamp(cpu_buffer->buffer); return ts; } /** * ring_buffer_nr_pages - get the number of buffer pages in the ring buffer * @buffer: The ring_buffer to get the number of pages from * @cpu: The cpu of the ring_buffer to get the number of pages from * * Returns the number of pages used by a per_cpu buffer of the ring buffer. */ size_t ring_buffer_nr_pages(struct trace_buffer *buffer, int cpu) { return buffer->buffers[cpu]->nr_pages; } /** * ring_buffer_nr_dirty_pages - get the number of used pages in the ring buffer * @buffer: The ring_buffer to get the number of pages from * @cpu: The cpu of the ring_buffer to get the number of pages from * * Returns the number of pages that have content in the ring buffer. */ size_t ring_buffer_nr_dirty_pages(struct trace_buffer *buffer, int cpu) { size_t read; size_t lost; size_t cnt; read = local_read(&buffer->buffers[cpu]->pages_read); lost = local_read(&buffer->buffers[cpu]->pages_lost); cnt = local_read(&buffer->buffers[cpu]->pages_touched); if (WARN_ON_ONCE(cnt < lost)) return 0; cnt -= lost; /* The reader can read an empty page, but not more than that */ if (cnt < read) { WARN_ON_ONCE(read > cnt + 1); return 0; } return cnt - read; } static __always_inline bool full_hit(struct trace_buffer *buffer, int cpu, int full) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; size_t nr_pages; size_t dirty; nr_pages = cpu_buffer->nr_pages; if (!nr_pages || !full) return true; /* * Add one as dirty will never equal nr_pages, as the sub-buffer * that the writer is on is not counted as dirty. * This is needed if "buffer_percent" is set to 100. */ dirty = ring_buffer_nr_dirty_pages(buffer, cpu) + 1; return (dirty * 100) >= (full * nr_pages); } /* * rb_wake_up_waiters - wake up tasks waiting for ring buffer input * * Schedules a delayed work to wake up any task that is blocked on the * ring buffer waiters queue. */ static void rb_wake_up_waiters(struct irq_work *work) { struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work); wake_up_all(&rbwork->waiters); if (rbwork->full_waiters_pending || rbwork->wakeup_full) { /* Only cpu_buffer sets the above flags */ struct ring_buffer_per_cpu *cpu_buffer = container_of(rbwork, struct ring_buffer_per_cpu, irq_work); /* Called from interrupt context */ raw_spin_lock(&cpu_buffer->reader_lock); rbwork->wakeup_full = false; rbwork->full_waiters_pending = false; /* Waking up all waiters, they will reset the shortest full */ cpu_buffer->shortest_full = 0; raw_spin_unlock(&cpu_buffer->reader_lock); wake_up_all(&rbwork->full_waiters); } } /** * ring_buffer_wake_waiters - wake up any waiters on this ring buffer * @buffer: The ring buffer to wake waiters on * * In the case of a file that represents a ring buffer is closing, * it is prudent to wake up any waiters that are on this. */ void ring_buffer_wake_waiters(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; struct rb_irq_work *rbwork; if (!buffer) return; if (cpu == RING_BUFFER_ALL_CPUS) { /* Wake up individual ones too. One level recursion */ for_each_buffer_cpu(buffer, cpu) ring_buffer_wake_waiters(buffer, cpu); rbwork = &buffer->irq_work; } else { if (WARN_ON_ONCE(!buffer->buffers)) return; if (WARN_ON_ONCE(cpu >= nr_cpu_ids)) return; cpu_buffer = buffer->buffers[cpu]; /* The CPU buffer may not have been initialized yet */ if (!cpu_buffer) return; rbwork = &cpu_buffer->irq_work; } /* This can be called in any context */ irq_work_queue(&rbwork->work); } static bool rb_watermark_hit(struct trace_buffer *buffer, int cpu, int full) { struct ring_buffer_per_cpu *cpu_buffer; bool ret = false; /* Reads of all CPUs always waits for any data */ if (cpu == RING_BUFFER_ALL_CPUS) return !ring_buffer_empty(buffer); cpu_buffer = buffer->buffers[cpu]; if (!ring_buffer_empty_cpu(buffer, cpu)) { unsigned long flags; bool pagebusy; if (!full) return true; raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page; ret = !pagebusy && full_hit(buffer, cpu, full); if (!ret && (!cpu_buffer->shortest_full || cpu_buffer->shortest_full > full)) { cpu_buffer->shortest_full = full; } raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } return ret; } static inline bool rb_wait_cond(struct rb_irq_work *rbwork, struct trace_buffer *buffer, int cpu, int full, ring_buffer_cond_fn cond, void *data) { if (rb_watermark_hit(buffer, cpu, full)) return true; if (cond(data)) return true; /* * The events can happen in critical sections where * checking a work queue can cause deadlocks. * After adding a task to the queue, this flag is set * only to notify events to try to wake up the queue * using irq_work. * * We don't clear it even if the buffer is no longer * empty. The flag only causes the next event to run * irq_work to do the work queue wake up. The worse * that can happen if we race with !trace_empty() is that * an event will cause an irq_work to try to wake up * an empty queue. * * There's no reason to protect this flag either, as * the work queue and irq_work logic will do the necessary * synchronization for the wake ups. The only thing * that is necessary is that the wake up happens after * a task has been queued. It's OK for spurious wake ups. */ if (full) rbwork->full_waiters_pending = true; else rbwork->waiters_pending = true; return false; } /* * The default wait condition for ring_buffer_wait() is to just to exit the * wait loop the first time it is woken up. */ static bool rb_wait_once(void *data) { long *once = data; /* wait_event() actually calls this twice before scheduling*/ if (*once > 1) return true; (*once)++; return false; } /** * ring_buffer_wait - wait for input to the ring buffer * @buffer: buffer to wait on * @cpu: the cpu buffer to wait on * @full: wait until the percentage of pages are available, if @cpu != RING_BUFFER_ALL_CPUS * * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon * as data is added to any of the @buffer's cpu buffers. Otherwise * it will wait for data to be added to a specific cpu buffer. */ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full) { struct ring_buffer_per_cpu *cpu_buffer; struct wait_queue_head *waitq; ring_buffer_cond_fn cond; struct rb_irq_work *rbwork; void *data; long once = 0; int ret = 0; cond = rb_wait_once; data = &once; /* * Depending on what the caller is waiting for, either any * data in any cpu buffer, or a specific buffer, put the * caller on the appropriate wait queue. */ if (cpu == RING_BUFFER_ALL_CPUS) { rbwork = &buffer->irq_work; /* Full only makes sense on per cpu reads */ full = 0; } else { if (!cpumask_test_cpu(cpu, buffer->cpumask)) return -ENODEV; cpu_buffer = buffer->buffers[cpu]; rbwork = &cpu_buffer->irq_work; } if (full) waitq = &rbwork->full_waiters; else waitq = &rbwork->waiters; ret = wait_event_interruptible((*waitq), rb_wait_cond(rbwork, buffer, cpu, full, cond, data)); return ret; } /** * ring_buffer_poll_wait - poll on buffer input * @buffer: buffer to wait on * @cpu: the cpu buffer to wait on * @filp: the file descriptor * @poll_table: The poll descriptor * @full: wait until the percentage of pages are available, if @cpu != RING_BUFFER_ALL_CPUS * * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon * as data is added to any of the @buffer's cpu buffers. Otherwise * it will wait for data to be added to a specific cpu buffer. * * Returns EPOLLIN | EPOLLRDNORM if data exists in the buffers, * zero otherwise. */ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu, struct file *filp, poll_table *poll_table, int full) { struct ring_buffer_per_cpu *cpu_buffer; struct rb_irq_work *rbwork; if (cpu == RING_BUFFER_ALL_CPUS) { rbwork = &buffer->irq_work; full = 0; } else { if (!cpumask_test_cpu(cpu, buffer->cpumask)) return EPOLLERR; cpu_buffer = buffer->buffers[cpu]; rbwork = &cpu_buffer->irq_work; } if (full) { unsigned long flags; poll_wait(filp, &rbwork->full_waiters, poll_table); raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); if (!cpu_buffer->shortest_full || cpu_buffer->shortest_full > full) cpu_buffer->shortest_full = full; raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); if (full_hit(buffer, cpu, full)) return EPOLLIN | EPOLLRDNORM; /* * Only allow full_waiters_pending update to be seen after * the shortest_full is set. If the writer sees the * full_waiters_pending flag set, it will compare the * amount in the ring buffer to shortest_full. If the amount * in the ring buffer is greater than the shortest_full * percent, it will call the irq_work handler to wake up * this list. The irq_handler will reset shortest_full * back to zero. That's done under the reader_lock, but * the below smp_mb() makes sure that the update to * full_waiters_pending doesn't leak up into the above. */ smp_mb(); rbwork->full_waiters_pending = true; return 0; } poll_wait(filp, &rbwork->waiters, poll_table); rbwork->waiters_pending = true; /* * There's a tight race between setting the waiters_pending and * checking if the ring buffer is empty. Once the waiters_pending bit * is set, the next event will wake the task up, but we can get stuck * if there's only a single event in. * * FIXME: Ideally, we need a memory barrier on the writer side as well, * but adding a memory barrier to all events will cause too much of a * performance hit in the fast path. We only need a memory barrier when * the buffer goes from empty to having content. But as this race is * extremely small, and it's not a problem if another event comes in, we * will fix it later. */ smp_mb(); if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) || (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu))) return EPOLLIN | EPOLLRDNORM; return 0; } /* buffer may be either ring_buffer or ring_buffer_per_cpu */ #define RB_WARN_ON(b, cond) \ ({ \ int _____ret = unlikely(cond); \ if (_____ret) { \ if (__same_type(*(b), struct ring_buffer_per_cpu)) { \ struct ring_buffer_per_cpu *__b = \ (void *)b; \ atomic_inc(&__b->buffer->record_disabled); \ } else \ atomic_inc(&b->record_disabled); \ WARN_ON(1); \ } \ _____ret; \ }) /* Up this if you want to test the TIME_EXTENTS and normalization */ #define DEBUG_SHIFT 0 static inline u64 rb_time_stamp(struct trace_buffer *buffer) { u64 ts; /* Skip retpolines :-( */ if (IS_ENABLED(CONFIG_RETPOLINE) && likely(buffer->clock == trace_clock_local)) ts = trace_clock_local(); else ts = buffer->clock(); /* shift to debug/test normalization and TIME_EXTENTS */ return ts << DEBUG_SHIFT; } u64 ring_buffer_time_stamp(struct trace_buffer *buffer) { u64 time; preempt_disable_notrace(); time = rb_time_stamp(buffer); preempt_enable_notrace(); return time; } EXPORT_SYMBOL_GPL(ring_buffer_time_stamp); void ring_buffer_normalize_time_stamp(struct trace_buffer *buffer, int cpu, u64 *ts) { /* Just stupid testing the normalize function and deltas */ *ts >>= DEBUG_SHIFT; } EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp); /* * Making the ring buffer lockless makes things tricky. * Although writes only happen on the CPU that they are on, * and they only need to worry about interrupts. Reads can * happen on any CPU. * * The reader page is always off the ring buffer, but when the * reader finishes with a page, it needs to swap its page with * a new one from the buffer. The reader needs to take from * the head (writes go to the tail). But if a writer is in overwrite * mode and wraps, it must push the head page forward. * * Here lies the problem. * * The reader must be careful to replace only the head page, and * not another one. As described at the top of the file in the * ASCII art, the reader sets its old page to point to the next * page after head. It then sets the page after head to point to * the old reader page. But if the writer moves the head page * during this operation, the reader could end up with the tail. * * We use cmpxchg to help prevent this race. We also do something * special with the page before head. We set the LSB to 1. * * When the writer must push the page forward, it will clear the * bit that points to the head page, move the head, and then set * the bit that points to the new head page. * * We also don't want an interrupt coming in and moving the head * page on another writer. Thus we use the second LSB to catch * that too. Thus: * * head->list->prev->next bit 1 bit 0 * ------- ------- * Normal page 0 0 * Points to head page 0 1 * New head page 1 0 * * Note we can not trust the prev pointer of the head page, because: * * +----+ +-----+ +-----+ * | |------>| T |---X--->| N | * | |<------| | | | * +----+ +-----+ +-----+ * ^ ^ | * | +-----+ | | * +----------| R |----------+ | * | |<-----------+ * +-----+ * * Key: ---X--> HEAD flag set in pointer * T Tail page * R Reader page * N Next page * * (see __rb_reserve_next() to see where this happens) * * What the above shows is that the reader just swapped out * the reader page with a page in the buffer, but before it * could make the new header point back to the new page added * it was preempted by a writer. The writer moved forward onto * the new page added by the reader and is about to move forward * again. * * You can see, it is legitimate for the previous pointer of * the head (or any page) not to point back to itself. But only * temporarily. */ #define RB_PAGE_NORMAL 0UL #define RB_PAGE_HEAD 1UL #define RB_PAGE_UPDATE 2UL #define RB_FLAG_MASK 3UL /* PAGE_MOVED is not part of the mask */ #define RB_PAGE_MOVED 4UL /* * rb_list_head - remove any bit */ static struct list_head *rb_list_head(struct list_head *list) { unsigned long val = (unsigned long)list; return (struct list_head *)(val & ~RB_FLAG_MASK); } /* * rb_is_head_page - test if the given page is the head page * * Because the reader may move the head_page pointer, we can * not trust what the head page is (it may be pointing to * the reader page). But if the next page is a header page, * its flags will be non zero. */ static inline int rb_is_head_page(struct buffer_page *page, struct list_head *list) { unsigned long val; val = (unsigned long)list->next; if ((val & ~RB_FLAG_MASK) != (unsigned long)&page->list) return RB_PAGE_MOVED; return val & RB_FLAG_MASK; } /* * rb_is_reader_page * * The unique thing about the reader page, is that, if the * writer is ever on it, the previous pointer never points * back to the reader page. */ static bool rb_is_reader_page(struct buffer_page *page) { struct list_head *list = page->list.prev; return rb_list_head(list->next) != &page->list; } /* * rb_set_list_to_head - set a list_head to be pointing to head. */ static void rb_set_list_to_head(struct list_head *list) { unsigned long *ptr; ptr = (unsigned long *)&list->next; *ptr |= RB_PAGE_HEAD; *ptr &= ~RB_PAGE_UPDATE; } /* * rb_head_page_activate - sets up head page */ static void rb_head_page_activate(struct ring_buffer_per_cpu *cpu_buffer) { struct buffer_page *head; head = cpu_buffer->head_page; if (!head) return; /* * Set the previous list pointer to have the HEAD flag. */ rb_set_list_to_head(head->list.prev); } static void rb_list_head_clear(struct list_head *list) { unsigned long *ptr = (unsigned long *)&list->next; *ptr &= ~RB_FLAG_MASK; } /* * rb_head_page_deactivate - clears head page ptr (for free list) */ static void rb_head_page_deactivate(struct ring_buffer_per_cpu *cpu_buffer) { struct list_head *hd; /* Go through the whole list and clear any pointers found. */ rb_list_head_clear(cpu_buffer->pages); list_for_each(hd, cpu_buffer->pages) rb_list_head_clear(hd); } static int rb_head_page_set(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *head, struct buffer_page *prev, int old_flag, int new_flag) { struct list_head *list; unsigned long val = (unsigned long)&head->list; unsigned long ret; list = &prev->list; val &= ~RB_FLAG_MASK; ret = cmpxchg((unsigned long *)&list->next, val | old_flag, val | new_flag); /* check if the reader took the page */ if ((ret & ~RB_FLAG_MASK) != val) return RB_PAGE_MOVED; return ret & RB_FLAG_MASK; } static int rb_head_page_set_update(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *head, struct buffer_page *prev, int old_flag) { return rb_head_page_set(cpu_buffer, head, prev, old_flag, RB_PAGE_UPDATE); } static int rb_head_page_set_head(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *head, struct buffer_page *prev, int old_flag) { return rb_head_page_set(cpu_buffer, head, prev, old_flag, RB_PAGE_HEAD); } static int rb_head_page_set_normal(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *head, struct buffer_page *prev, int old_flag) { return rb_head_page_set(cpu_buffer, head, prev, old_flag, RB_PAGE_NORMAL); } static inline void rb_inc_page(struct buffer_page **bpage) { struct list_head *p = rb_list_head((*bpage)->list.next); *bpage = list_entry(p, struct buffer_page, list); } static struct buffer_page * rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer) { struct buffer_page *head; struct buffer_page *page; struct list_head *list; int i; if (RB_WARN_ON(cpu_buffer, !cpu_buffer->head_page)) return NULL; /* sanity check */ list = cpu_buffer->pages; if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev->next) != list)) return NULL; page = head = cpu_buffer->head_page; /* * It is possible that the writer moves the header behind * where we started, and we miss in one loop. * A second loop should grab the header, but we'll do * three loops just because I'm paranoid. */ for (i = 0; i < 3; i++) { do { if (rb_is_head_page(page, page->list.prev)) { cpu_buffer->head_page = page; return page; } rb_inc_page(&page); } while (page != head); } RB_WARN_ON(cpu_buffer, 1); return NULL; } static int rb_head_page_replace(struct buffer_page *old, struct buffer_page *new) { unsigned long *ptr = (unsigned long *)&old->list.prev->next; unsigned long val; unsigned long ret; val = *ptr & ~RB_FLAG_MASK; val |= RB_PAGE_HEAD; ret = cmpxchg(ptr, val, (unsigned long)&new->list); return ret == val; } /* * rb_tail_page_update - move the tail page forward */ static void rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *tail_page, struct buffer_page *next_page) { unsigned long old_entries; unsigned long old_write; /* * The tail page now needs to be moved forward. * * We need to reset the tail page, but without messing * with possible erasing of data brought in by interrupts * that have moved the tail page and are currently on it. * * We add a counter to the write field to denote this. */ old_write = local_add_return(RB_WRITE_INTCNT, &next_page->write); old_entries = local_add_return(RB_WRITE_INTCNT, &next_page->entries); /* * Just make sure we have seen our old_write and synchronize * with any interrupts that come in. */ barrier(); /* * If the tail page is still the same as what we think * it is, then it is up to us to update the tail * pointer. */ if (tail_page == READ_ONCE(cpu_buffer->tail_page)) { /* Zero the write counter */ unsigned long val = old_write & ~RB_WRITE_MASK; unsigned long eval = old_entries & ~RB_WRITE_MASK; /* * This will only succeed if an interrupt did * not come in and change it. In which case, we * do not want to modify it. * * We add (void) to let the compiler know that we do not care * about the return value of these functions. We use the * cmpxchg to only update if an interrupt did not already * do it for us. If the cmpxchg fails, we don't care. */ (void)local_cmpxchg(&next_page->write, old_write, val); (void)local_cmpxchg(&next_page->entries, old_entries, eval); /* * No need to worry about races with clearing out the commit. * it only can increment when a commit takes place. But that * only happens in the outer most nested commit. */ local_set(&next_page->page->commit, 0); /* Either we update tail_page or an interrupt does */ if (try_cmpxchg(&cpu_buffer->tail_page, &tail_page, next_page)) local_inc(&cpu_buffer->pages_touched); } } static int rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *bpage) { unsigned long val = (unsigned long)bpage; if (RB_WARN_ON(cpu_buffer, val & RB_FLAG_MASK)) return 1; return 0; } /** * rb_check_pages - integrity check of buffer pages * @cpu_buffer: CPU buffer with pages to test * * As a safety measure we check to make sure the data pages have not * been corrupted. * * Callers of this function need to guarantee that the list of pages doesn't get * modified during the check. In particular, if it's possible that the function * is invoked with concurrent readers which can swap in a new reader page then * the caller should take cpu_buffer->reader_lock. */ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) { struct list_head *head = rb_list_head(cpu_buffer->pages); struct list_head *tmp; if (RB_WARN_ON(cpu_buffer, rb_list_head(rb_list_head(head->next)->prev) != head)) return -1; if (RB_WARN_ON(cpu_buffer, rb_list_head(rb_list_head(head->prev)->next) != head)) return -1; for (tmp = rb_list_head(head->next); tmp != head; tmp = rb_list_head(tmp->next)) { if (RB_WARN_ON(cpu_buffer, rb_list_head(rb_list_head(tmp->next)->prev) != tmp)) return -1; if (RB_WARN_ON(cpu_buffer, rb_list_head(rb_list_head(tmp->prev)->next) != tmp)) return -1; } return 0; } static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, long nr_pages, struct list_head *pages) { struct buffer_page *bpage, *tmp; bool user_thread = current->mm != NULL; gfp_t mflags; long i; /* * Check if the available memory is there first. * Note, si_mem_available() only gives us a rough estimate of available * memory. It may not be accurate. But we don't care, we just want * to prevent doing any allocation when it is obvious that it is * not going to succeed. */ i = si_mem_available(); if (i < nr_pages) return -ENOMEM; /* * __GFP_RETRY_MAYFAIL flag makes sure that the allocation fails * gracefully without invoking oom-killer and the system is not * destabilized. */ mflags = GFP_KERNEL | __GFP_RETRY_MAYFAIL; /* * If a user thread allocates too much, and si_mem_available() * reports there's enough memory, even though there is not. * Make sure the OOM killer kills this thread. This can happen * even with RETRY_MAYFAIL because another task may be doing * an allocation after this task has taken all memory. * This is the task the OOM killer needs to take out during this * loop, even if it was triggered by an allocation somewhere else. */ if (user_thread) set_current_oom_origin(); for (i = 0; i < nr_pages; i++) { struct page *page; bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), mflags, cpu_to_node(cpu_buffer->cpu)); if (!bpage) goto free_pages; rb_check_bpage(cpu_buffer, bpage); list_add(&bpage->list, pages); page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), mflags, 0); if (!page) goto free_pages; bpage->page = page_address(page); rb_init_page(bpage->page); if (user_thread && fatal_signal_pending(current)) goto free_pages; } if (user_thread) clear_current_oom_origin(); return 0; free_pages: list_for_each_entry_safe(bpage, tmp, pages, list) { list_del_init(&bpage->list); free_buffer_page(bpage); } if (user_thread) clear_current_oom_origin(); return -ENOMEM; } static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages) { LIST_HEAD(pages); WARN_ON(!nr_pages); if (__rb_allocate_pages(cpu_buffer, nr_pages, &pages)) return -ENOMEM; /* * The ring buffer page list is a circular list that does not * start and end with a list head. All page list items point to * other pages. */ cpu_buffer->pages = pages.next; list_del(&pages); cpu_buffer->nr_pages = nr_pages; rb_check_pages(cpu_buffer); return 0; } static struct ring_buffer_per_cpu * rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; struct buffer_page *bpage; struct page *page; int ret; cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu)); if (!cpu_buffer) return NULL; cpu_buffer->cpu = cpu; cpu_buffer->buffer = buffer; raw_spin_lock_init(&cpu_buffer->reader_lock); lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; INIT_WORK(&cpu_buffer->update_pages_work, update_pages_handler); init_completion(&cpu_buffer->update_done); init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters); init_waitqueue_head(&cpu_buffer->irq_work.waiters); init_waitqueue_head(&cpu_buffer->irq_work.full_waiters); bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu)); if (!bpage) goto fail_free_buffer; rb_check_bpage(cpu_buffer, bpage); cpu_buffer->reader_page = bpage; page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, 0); if (!page) goto fail_free_reader; bpage->page = page_address(page); rb_init_page(bpage->page); INIT_LIST_HEAD(&cpu_buffer->reader_page->list); INIT_LIST_HEAD(&cpu_buffer->new_pages); ret = rb_allocate_pages(cpu_buffer, nr_pages); if (ret < 0) goto fail_free_reader; cpu_buffer->head_page = list_entry(cpu_buffer->pages, struct buffer_page, list); cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page; rb_head_page_activate(cpu_buffer); return cpu_buffer; fail_free_reader: free_buffer_page(cpu_buffer->reader_page); fail_free_buffer: kfree(cpu_buffer); return NULL; } static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) { struct list_head *head = cpu_buffer->pages; struct buffer_page *bpage, *tmp; irq_work_sync(&cpu_buffer->irq_work.work); free_buffer_page(cpu_buffer->reader_page); if (head) { rb_head_page_deactivate(cpu_buffer); list_for_each_entry_safe(bpage, tmp, head, list) { list_del_init(&bpage->list); free_buffer_page(bpage); } bpage = list_entry(head, struct buffer_page, list); free_buffer_page(bpage); } free_page((unsigned long)cpu_buffer->free_page); kfree(cpu_buffer); } /** * __ring_buffer_alloc - allocate a new ring_buffer * @size: the size in bytes per cpu that is needed. * @flags: attributes to set for the ring buffer. * @key: ring buffer reader_lock_key. * * Currently the only flag that is available is the RB_FL_OVERWRITE * flag. This flag means that the buffer will overwrite old data * when the buffer wraps. If this flag is not set, the buffer will * drop data when the tail hits the head. */ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *key) { struct trace_buffer *buffer; long nr_pages; int bsize; int cpu; int ret; /* keep it in its own cache line */ buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()), GFP_KERNEL); if (!buffer) return NULL; if (!zalloc_cpumask_var(&buffer->cpumask, GFP_KERNEL)) goto fail_free_buffer; nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); buffer->flags = flags; buffer->clock = trace_clock_local; buffer->reader_lock_key = key; init_irq_work(&buffer->irq_work.work, rb_wake_up_waiters); init_waitqueue_head(&buffer->irq_work.waiters); /* need at least two pages */ if (nr_pages < 2) nr_pages = 2; buffer->cpus = nr_cpu_ids; bsize = sizeof(void *) * nr_cpu_ids; buffer->buffers = kzalloc(ALIGN(bsize, cache_line_size()), GFP_KERNEL); if (!buffer->buffers) goto fail_free_cpumask; cpu = raw_smp_processor_id(); cpumask_set_cpu(cpu, buffer->cpumask); buffer->buffers[cpu] = rb_allocate_cpu_buffer(buffer, nr_pages, cpu); if (!buffer->buffers[cpu]) goto fail_free_buffers; ret = cpuhp_state_add_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node); if (ret < 0) goto fail_free_buffers; mutex_init(&buffer->mutex); return buffer; fail_free_buffers: for_each_buffer_cpu(buffer, cpu) { if (buffer->buffers[cpu]) rb_free_cpu_buffer(buffer->buffers[cpu]); } kfree(buffer->buffers); fail_free_cpumask: free_cpumask_var(buffer->cpumask); fail_free_buffer: kfree(buffer); return NULL; } EXPORT_SYMBOL_GPL(__ring_buffer_alloc); struct trace_buffer *ring_buffer_alloc_ext(unsigned long size, struct ring_buffer_ext_cb *cb) { struct trace_buffer *buffer; if (!cb || !cb->update_footers || !cb->swap_reader) return NULL; buffer = ring_buffer_alloc(size, RB_FL_OVERWRITE); if (!buffer) return NULL; WARN_ON(cpuhp_state_remove_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node)); buffer->ext_cb = cb; atomic_set(&buffer->record_disabled, 1); return buffer; } /** * ring_buffer_free - free a ring buffer. * @buffer: the buffer to free. */ void ring_buffer_free(struct trace_buffer *buffer) { int cpu; if (!has_ext_writer(buffer)) cpuhp_state_remove_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node); irq_work_sync(&buffer->irq_work.work); for_each_buffer_cpu(buffer, cpu) rb_free_cpu_buffer(buffer->buffers[cpu]); kfree(buffer->buffers); free_cpumask_var(buffer->cpumask); kfree(buffer); } EXPORT_SYMBOL_GPL(ring_buffer_free); void ring_buffer_set_clock(struct trace_buffer *buffer, u64 (*clock)(void)) { buffer->clock = clock; } void ring_buffer_set_time_stamp_abs(struct trace_buffer *buffer, bool abs) { buffer->time_stamp_abs = abs; } bool ring_buffer_time_stamp_abs(struct trace_buffer *buffer) { return buffer->time_stamp_abs; } static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); static inline unsigned long rb_page_entries(struct buffer_page *bpage) { return local_read(&bpage->entries) & RB_WRITE_MASK; } static inline unsigned long rb_page_write(struct buffer_page *bpage) { return local_read(&bpage->write) & RB_WRITE_MASK; } static int rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages) { struct list_head *tail_page, *to_remove, *next_page; struct buffer_page *to_remove_page, *tmp_iter_page; struct buffer_page *last_page, *first_page; unsigned long nr_removed; unsigned long head_bit; int page_entries; head_bit = 0; raw_spin_lock_irq(&cpu_buffer->reader_lock); atomic_inc(&cpu_buffer->record_disabled); /* * We don't race with the readers since we have acquired the reader * lock. We also don't race with writers after disabling recording. * This makes it easy to figure out the first and the last page to be * removed from the list. We unlink all the pages in between including * the first and last pages. This is done in a busy loop so that we * lose the least number of traces. * The pages are freed after we restart recording and unlock readers. */ tail_page = &cpu_buffer->tail_page->list; /* * tail page might be on reader page, we remove the next page * from the ring buffer */ if (cpu_buffer->tail_page == cpu_buffer->reader_page) tail_page = rb_list_head(tail_page->next); to_remove = tail_page; /* start of pages to remove */ first_page = list_entry(rb_list_head(to_remove->next), struct buffer_page, list); for (nr_removed = 0; nr_removed < nr_pages; nr_removed++) { to_remove = rb_list_head(to_remove)->next; head_bit |= (unsigned long)to_remove & RB_PAGE_HEAD; } next_page = rb_list_head(to_remove)->next; /* * Now we remove all pages between tail_page and next_page. * Make sure that we have head_bit value preserved for the * next page */ tail_page->next = (struct list_head *)((unsigned long)next_page | head_bit); next_page = rb_list_head(next_page); next_page->prev = tail_page; /* make sure pages points to a valid page in the ring buffer */ cpu_buffer->pages = next_page; /* update head page */ if (head_bit) cpu_buffer->head_page = list_entry(next_page, struct buffer_page, list); /* * change read pointer to make sure any read iterators reset * themselves */ cpu_buffer->read = 0; /* pages are removed, resume tracing and then free the pages */ atomic_dec(&cpu_buffer->record_disabled); raw_spin_unlock_irq(&cpu_buffer->reader_lock); RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)); /* last buffer page to remove */ last_page = list_entry(rb_list_head(to_remove), struct buffer_page, list); tmp_iter_page = first_page; do { cond_resched(); to_remove_page = tmp_iter_page; rb_inc_page(&tmp_iter_page); /* update the counters */ page_entries = rb_page_entries(to_remove_page); if (page_entries) { /* * If something was added to this page, it was full * since it is not the tail page. So we deduct the * bytes consumed in ring buffer from here. * Increment overrun to account for the lost events. */ local_add(page_entries, &cpu_buffer->overrun); local_sub(rb_page_commit(to_remove_page), &cpu_buffer->entries_bytes); local_inc(&cpu_buffer->pages_lost); } /* * We have already removed references to this list item, just * free up the buffer_page and its page */ free_buffer_page(to_remove_page); nr_removed--; } while (to_remove_page != last_page); RB_WARN_ON(cpu_buffer, nr_removed); return nr_removed == 0; } static int rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer) { struct list_head *pages = &cpu_buffer->new_pages; int retries, success; raw_spin_lock_irq(&cpu_buffer->reader_lock); /* * We are holding the reader lock, so the reader page won't be swapped * in the ring buffer. Now we are racing with the writer trying to * move head page and the tail page. * We are going to adapt the reader page update process where: * 1. We first splice the start and end of list of new pages between * the head page and its previous page. * 2. We cmpxchg the prev_page->next to point from head page to the * start of new pages list. * 3. Finally, we update the head->prev to the end of new list. * * We will try this process 10 times, to make sure that we don't keep * spinning. */ retries = 10; success = 0; while (retries--) { struct list_head *head_page, *prev_page, *r; struct list_head *last_page, *first_page; struct list_head *head_page_with_bit; head_page = &rb_set_head_page(cpu_buffer)->list; if (!head_page) break; prev_page = head_page->prev; first_page = pages->next; last_page = pages->prev; head_page_with_bit = (struct list_head *) ((unsigned long)head_page | RB_PAGE_HEAD); last_page->next = head_page_with_bit; first_page->prev = prev_page; r = cmpxchg(&prev_page->next, head_page_with_bit, first_page); if (r == head_page_with_bit) { /* * yay, we replaced the page pointer to our new list, * now, we just have to update to head page's prev * pointer to point to end of list */ head_page->prev = last_page; success = 1; break; } } if (success) INIT_LIST_HEAD(pages); /* * If we weren't successful in adding in new pages, warn and stop * tracing */ RB_WARN_ON(cpu_buffer, !success); raw_spin_unlock_irq(&cpu_buffer->reader_lock); /* free pages if they weren't inserted */ if (!success) { struct buffer_page *bpage, *tmp; list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages, list) { list_del_init(&bpage->list); free_buffer_page(bpage); } } return success; } static void rb_update_pages(struct ring_buffer_per_cpu *cpu_buffer) { int success; if (cpu_buffer->nr_pages_to_update > 0) success = rb_insert_pages(cpu_buffer); else success = rb_remove_pages(cpu_buffer, -cpu_buffer->nr_pages_to_update); if (success) cpu_buffer->nr_pages += cpu_buffer->nr_pages_to_update; } static void update_pages_handler(struct work_struct *work) { struct ring_buffer_per_cpu *cpu_buffer = container_of(work, struct ring_buffer_per_cpu, update_pages_work); rb_update_pages(cpu_buffer); complete(&cpu_buffer->update_done); } /** * ring_buffer_resize - resize the ring buffer * @buffer: the buffer to resize. * @size: the new size. * @cpu_id: the cpu buffer to resize * * Minimum size is 2 * BUF_PAGE_SIZE. * * Returns 0 on success and < 0 on failure. */ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, int cpu_id) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long nr_pages; int cpu, err; if (unlikely(has_ext_writer(buffer))) return -EINVAL; /* * Always succeed at resizing a non-existent buffer: */ if (!buffer) return 0; /* Make sure the requested buffer exists */ if (cpu_id != RING_BUFFER_ALL_CPUS && !cpumask_test_cpu(cpu_id, buffer->cpumask)) return 0; nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); /* we need a minimum of two pages */ if (nr_pages < 2) nr_pages = 2; /* prevent another thread from changing buffer sizes */ mutex_lock(&buffer->mutex); atomic_inc(&buffer->resizing); if (cpu_id == RING_BUFFER_ALL_CPUS) { /* * Don't succeed if resizing is disabled, as a reader might be * manipulating the ring buffer and is expecting a sane state while * this is true. */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; if (atomic_read(&cpu_buffer->resize_disabled)) { err = -EBUSY; goto out_err_unlock; } } /* calculate the pages to update */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; cpu_buffer->nr_pages_to_update = nr_pages - cpu_buffer->nr_pages; /* * nothing more to do for removing pages or no update */ if (cpu_buffer->nr_pages_to_update <= 0) continue; /* * to add pages, make sure all new pages can be * allocated without receiving ENOMEM */ INIT_LIST_HEAD(&cpu_buffer->new_pages); if (__rb_allocate_pages(cpu_buffer, cpu_buffer->nr_pages_to_update, &cpu_buffer->new_pages)) { /* not enough memory for new pages */ err = -ENOMEM; goto out_err; } cond_resched(); } cpus_read_lock(); /* * Fire off all the required work handlers * We can't schedule on offline CPUs, but it's not necessary * since we can change their buffer sizes without any race. */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; if (!cpu_buffer->nr_pages_to_update) continue; /* Can't run something on an offline CPU. */ if (!cpu_online(cpu)) { rb_update_pages(cpu_buffer); cpu_buffer->nr_pages_to_update = 0; } else { schedule_work_on(cpu, &cpu_buffer->update_pages_work); } } /* wait for all the updates to complete */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; if (!cpu_buffer->nr_pages_to_update) continue; if (cpu_online(cpu)) wait_for_completion(&cpu_buffer->update_done); cpu_buffer->nr_pages_to_update = 0; } cpus_read_unlock(); } else { cpu_buffer = buffer->buffers[cpu_id]; if (nr_pages == cpu_buffer->nr_pages) goto out; /* * Don't succeed if resizing is disabled, as a reader might be * manipulating the ring buffer and is expecting a sane state while * this is true. */ if (atomic_read(&cpu_buffer->resize_disabled)) { err = -EBUSY; goto out_err_unlock; } cpu_buffer->nr_pages_to_update = nr_pages - cpu_buffer->nr_pages; INIT_LIST_HEAD(&cpu_buffer->new_pages); if (cpu_buffer->nr_pages_to_update > 0 && __rb_allocate_pages(cpu_buffer, cpu_buffer->nr_pages_to_update, &cpu_buffer->new_pages)) { err = -ENOMEM; goto out_err; } cpus_read_lock(); /* Can't run something on an offline CPU. */ if (!cpu_online(cpu_id)) rb_update_pages(cpu_buffer); else { schedule_work_on(cpu_id, &cpu_buffer->update_pages_work); wait_for_completion(&cpu_buffer->update_done); } cpu_buffer->nr_pages_to_update = 0; cpus_read_unlock(); } out: /* * The ring buffer resize can happen with the ring buffer * enabled, so that the update disturbs the tracing as little * as possible. But if the buffer is disabled, we do not need * to worry about that, and we can take the time to verify * that the buffer is not corrupt. */ if (atomic_read(&buffer->record_disabled)) { atomic_inc(&buffer->record_disabled); /* * Even though the buffer was disabled, we must make sure * that it is truly disabled before calling rb_check_pages. * There could have been a race between checking * record_disable and incrementing it. */ synchronize_rcu(); for_each_buffer_cpu(buffer, cpu) { unsigned long flags; cpu_buffer = buffer->buffers[cpu]; raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); rb_check_pages(cpu_buffer); raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } atomic_dec(&buffer->record_disabled); } atomic_dec(&buffer->resizing); mutex_unlock(&buffer->mutex); return 0; out_err: for_each_buffer_cpu(buffer, cpu) { struct buffer_page *bpage, *tmp; cpu_buffer = buffer->buffers[cpu]; cpu_buffer->nr_pages_to_update = 0; if (list_empty(&cpu_buffer->new_pages)) continue; list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages, list) { list_del_init(&bpage->list); free_buffer_page(bpage); } } out_err_unlock: atomic_dec(&buffer->resizing); mutex_unlock(&buffer->mutex); return err; } EXPORT_SYMBOL_GPL(ring_buffer_resize); void ring_buffer_change_overwrite(struct trace_buffer *buffer, int val) { mutex_lock(&buffer->mutex); if (val) buffer->flags |= RB_FL_OVERWRITE; else buffer->flags &= ~RB_FL_OVERWRITE; mutex_unlock(&buffer->mutex); } EXPORT_SYMBOL_GPL(ring_buffer_change_overwrite); static __always_inline void *__rb_page_index(struct buffer_page *bpage, unsigned index) { return bpage->page->data + index; } static __always_inline struct ring_buffer_event * rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer) { return __rb_page_index(cpu_buffer->reader_page, cpu_buffer->reader_page->read); } static struct ring_buffer_event * rb_iter_head_event(struct ring_buffer_iter *iter) { struct ring_buffer_event *event; struct buffer_page *iter_head_page = iter->head_page; unsigned long commit; unsigned length; if (iter->head != iter->next_event) return iter->event; /* * When the writer goes across pages, it issues a cmpxchg which * is a mb(), which will synchronize with the rmb here. * (see rb_tail_page_update() and __rb_reserve_next()) */ commit = rb_page_commit(iter_head_page); smp_rmb(); /* An event needs to be at least 8 bytes in size */ if (iter->head > commit - 8) goto reset; event = __rb_page_index(iter_head_page, iter->head); length = rb_event_length(event); /* * READ_ONCE() doesn't work on functions and we don't want the * compiler doing any crazy optimizations with length. */ barrier(); if ((iter->head + length) > commit || length > BUF_PAGE_SIZE) /* Writer corrupted the read? */ goto reset; memcpy(iter->event, event, length); /* * If the page stamp is still the same after this rmb() then the * event was safely copied without the writer entering the page. */ smp_rmb(); /* Make sure the page didn't change since we read this */ if (iter->page_stamp != iter_head_page->page->time_stamp || commit > rb_page_commit(iter_head_page)) goto reset; iter->next_event = iter->head + length; return iter->event; reset: /* Reset to the beginning */ iter->page_stamp = iter->read_stamp = iter->head_page->page->time_stamp; iter->head = 0; iter->next_event = 0; iter->missed_events = 1; return NULL; } /* Size is determined by what has been committed */ static __always_inline unsigned rb_page_size(struct buffer_page *bpage) { return rb_page_commit(bpage); } static __always_inline unsigned rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer) { return rb_page_commit(cpu_buffer->commit_page); } static __always_inline unsigned rb_event_index(struct ring_buffer_event *event) { unsigned long addr = (unsigned long)event; return (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE; } static void rb_inc_iter(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; /* * The iterator could be on the reader page (it starts there). * But the head could have moved, since the reader was * found. Check for this case and assign the iterator * to the head page instead of next. */ if (iter->head_page == cpu_buffer->reader_page) iter->head_page = rb_set_head_page(cpu_buffer); else rb_inc_page(&iter->head_page); iter->page_stamp = iter->read_stamp = iter->head_page->page->time_stamp; iter->head = 0; iter->next_event = 0; } /* * rb_handle_head_page - writer hit the head page * * Returns: +1 to retry page * 0 to continue * -1 on error */ static int rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *tail_page, struct buffer_page *next_page) { struct buffer_page *new_head; int entries; int type; int ret; entries = rb_page_entries(next_page); /* * The hard part is here. We need to move the head * forward, and protect against both readers on * other CPUs and writers coming in via interrupts. */ type = rb_head_page_set_update(cpu_buffer, next_page, tail_page, RB_PAGE_HEAD); /* * type can be one of four: * NORMAL - an interrupt already moved it for us * HEAD - we are the first to get here. * UPDATE - we are the interrupt interrupting * a current move. * MOVED - a reader on another CPU moved the next * pointer to its reader page. Give up * and try again. */ switch (type) { case RB_PAGE_HEAD: /* * We changed the head to UPDATE, thus * it is our responsibility to update * the counters. */ local_add(entries, &cpu_buffer->overrun); local_sub(rb_page_commit(next_page), &cpu_buffer->entries_bytes); local_inc(&cpu_buffer->pages_lost); /* * The entries will be zeroed out when we move the * tail page. */ /* still more to do */ break; case RB_PAGE_UPDATE: /* * This is an interrupt that interrupt the * previous update. Still more to do. */ break; case RB_PAGE_NORMAL: /* * An interrupt came in before the update * and processed this for us. * Nothing left to do. */ return 1; case RB_PAGE_MOVED: /* * The reader is on another CPU and just did * a swap with our next_page. * Try again. */ return 1; default: RB_WARN_ON(cpu_buffer, 1); /* WTF??? */ return -1; } /* * Now that we are here, the old head pointer is * set to UPDATE. This will keep the reader from * swapping the head page with the reader page. * The reader (on another CPU) will spin till * we are finished. * * We just need to protect against interrupts * doing the job. We will set the next pointer * to HEAD. After that, we set the old pointer * to NORMAL, but only if it was HEAD before. * otherwise we are an interrupt, and only * want the outer most commit to reset it. */ new_head = next_page; rb_inc_page(&new_head); ret = rb_head_page_set_head(cpu_buffer, new_head, next_page, RB_PAGE_NORMAL); /* * Valid returns are: * HEAD - an interrupt came in and already set it. * NORMAL - One of two things: * 1) We really set it. * 2) A bunch of interrupts came in and moved * the page forward again. */ switch (ret) { case RB_PAGE_HEAD: case RB_PAGE_NORMAL: /* OK */ break; default: RB_WARN_ON(cpu_buffer, 1); return -1; } /* * It is possible that an interrupt came in, * set the head up, then more interrupts came in * and moved it again. When we get back here, * the page would have been set to NORMAL but we * just set it back to HEAD. * * How do you detect this? Well, if that happened * the tail page would have moved. */ if (ret == RB_PAGE_NORMAL) { struct buffer_page *buffer_tail_page; buffer_tail_page = READ_ONCE(cpu_buffer->tail_page); /* * If the tail had moved passed next, then we need * to reset the pointer. */ if (buffer_tail_page != tail_page && buffer_tail_page != next_page) rb_head_page_set_normal(cpu_buffer, new_head, next_page, RB_PAGE_HEAD); } /* * If this was the outer most commit (the one that * changed the original pointer from HEAD to UPDATE), * then it is up to us to reset it to NORMAL. */ if (type == RB_PAGE_HEAD) { ret = rb_head_page_set_normal(cpu_buffer, next_page, tail_page, RB_PAGE_UPDATE); if (RB_WARN_ON(cpu_buffer, ret != RB_PAGE_UPDATE)) return -1; } return 0; } static inline void rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, unsigned long tail, struct rb_event_info *info) { struct buffer_page *tail_page = info->tail_page; struct ring_buffer_event *event; unsigned long length = info->length; /* * Only the event that crossed the page boundary * must fill the old tail_page with padding. */ if (tail >= BUF_PAGE_SIZE) { /* * If the page was filled, then we still need * to update the real_end. Reset it to zero * and the reader will ignore it. */ if (tail == BUF_PAGE_SIZE) tail_page->real_end = 0; local_sub(length, &tail_page->write); return; } event = __rb_page_index(tail_page, tail); /* * Save the original length to the meta data. * This will be used by the reader to add lost event * counter. */ tail_page->real_end = tail; /* * If this event is bigger than the minimum size, then * we need to be careful that we don't subtract the * write counter enough to allow another writer to slip * in on this page. * We put in a discarded commit instead, to make sure * that this space is not used again, and this space will * not be accounted into 'entries_bytes'. * * If we are less than the minimum size, we don't need to * worry about it. */ if (tail > (BUF_PAGE_SIZE - RB_EVNT_MIN_SIZE)) { /* No room for any events */ /* Mark the rest of the page with padding */ rb_event_set_padding(event); /* Make sure the padding is visible before the write update */ smp_wmb(); /* Set the write back to the previous setting */ local_sub(length, &tail_page->write); return; } /* Put in a discarded event */ event->array[0] = (BUF_PAGE_SIZE - tail) - RB_EVNT_HDR_SIZE; event->type_len = RINGBUF_TYPE_PADDING; /* time delta must be non zero */ event->time_delta = 1; /* account for padding bytes */ local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes); /* Make sure the padding is visible before the tail_page->write update */ smp_wmb(); /* Set write to end of buffer */ length = (tail + length) - BUF_PAGE_SIZE; local_sub(length, &tail_page->write); } static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer); /* * This is the slow path, force gcc not to inline it. */ static noinline struct ring_buffer_event * rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, unsigned long tail, struct rb_event_info *info) { struct buffer_page *tail_page = info->tail_page; struct buffer_page *commit_page = cpu_buffer->commit_page; struct trace_buffer *buffer = cpu_buffer->buffer; struct buffer_page *next_page; int ret; next_page = tail_page; rb_inc_page(&next_page); /* * If for some reason, we had an interrupt storm that made * it all the way around the buffer, bail, and warn * about it. */ if (unlikely(next_page == commit_page)) { local_inc(&cpu_buffer->commit_overrun); goto out_reset; } /* * This is where the fun begins! * * We are fighting against races between a reader that * could be on another CPU trying to swap its reader * page with the buffer head. * * We are also fighting against interrupts coming in and * moving the head or tail on us as well. * * If the next page is the head page then we have filled * the buffer, unless the commit page is still on the * reader page. */ if (rb_is_head_page(next_page, &tail_page->list)) { /* * If the commit is not on the reader page, then * move the header page. */ if (!rb_is_reader_page(cpu_buffer->commit_page)) { /* * If we are not in overwrite mode, * this is easy, just stop here. */ if (!(buffer->flags & RB_FL_OVERWRITE)) { local_inc(&cpu_buffer->dropped_events); goto out_reset; } ret = rb_handle_head_page(cpu_buffer, tail_page, next_page); if (ret < 0) goto out_reset; if (ret) goto out_again; } else { /* * We need to be careful here too. The * commit page could still be on the reader * page. We could have a small buffer, and * have filled up the buffer with events * from interrupts and such, and wrapped. * * Note, if the tail page is also on the * reader_page, we let it move out. */ if (unlikely((cpu_buffer->commit_page != cpu_buffer->tail_page) && (cpu_buffer->commit_page == cpu_buffer->reader_page))) { local_inc(&cpu_buffer->commit_overrun); goto out_reset; } } } rb_tail_page_update(cpu_buffer, tail_page, next_page); out_again: rb_reset_tail(cpu_buffer, tail, info); /* Commit what we have for now. */ rb_end_commit(cpu_buffer); /* rb_end_commit() decs committing */ local_inc(&cpu_buffer->committing); /* fail and let the caller try again */ return ERR_PTR(-EAGAIN); out_reset: /* reset write */ rb_reset_tail(cpu_buffer, tail, info); return NULL; } /* Slow path */ static struct ring_buffer_event * rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs) { if (abs) event->type_len = RINGBUF_TYPE_TIME_STAMP; else event->type_len = RINGBUF_TYPE_TIME_EXTEND; /* Not the first event on the page, or not delta? */ if (abs || rb_event_index(event)) { event->time_delta = delta & TS_MASK; event->array[0] = delta >> TS_SHIFT; } else { /* nope, just zero it */ event->time_delta = 0; event->array[0] = 0; } return skip_time_extend(event); } #ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK static inline bool sched_clock_stable(void) { return true; } #endif static void rb_check_timestamp(struct ring_buffer_per_cpu *cpu_buffer, struct rb_event_info *info) { u64 write_stamp; WARN_ONCE(1, "Delta way too big! %llu ts=%llu before=%llu after=%llu write stamp=%llu\n%s", (unsigned long long)info->delta, (unsigned long long)info->ts, (unsigned long long)info->before, (unsigned long long)info->after, (unsigned long long)(rb_time_read(&cpu_buffer->write_stamp, &write_stamp) ? write_stamp : 0), sched_clock_stable() ? "" : "If you just came from a suspend/resume,\n" "please switch to the trace global clock:\n" " echo global > /sys/kernel/debug/tracing/trace_clock\n" "or add trace_clock=global to the kernel command line\n"); } static void rb_add_timestamp(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event **event, struct rb_event_info *info, u64 *delta, unsigned int *length) { bool abs = info->add_timestamp & (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE); if (unlikely(info->delta > (1ULL << 59))) { /* * Some timers can use more than 59 bits, and when a timestamp * is added to the buffer, it will lose those bits. */ if (abs && (info->ts & TS_MSB)) { info->delta &= ABS_TS_MASK; /* did the clock go backwards */ } else if (info->before == info->after && info->before > info->ts) { /* not interrupted */ static int once; /* * This is possible with a recalibrating of the TSC. * Do not produce a call stack, but just report it. */ if (!once) { once++; pr_warn("Ring buffer clock went backwards: %llu -> %llu\n", info->before, info->ts); } } else rb_check_timestamp(cpu_buffer, info); if (!abs) info->delta = 0; } *event = rb_add_time_stamp(*event, info->delta, abs); *length -= RB_LEN_TIME_EXTEND; *delta = 0; } /** * rb_update_event - update event type and data * @cpu_buffer: The per cpu buffer of the @event * @event: the event to update * @info: The info to update the @event with (contains length and delta) * * Update the type and data fields of the @event. The length * is the actual size that is written to the ring buffer, * and with this, we can determine what to place into the * data field. */ static void rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event, struct rb_event_info *info) { unsigned length = info->length; u64 delta = info->delta; unsigned int nest = local_read(&cpu_buffer->committing) - 1; if (!WARN_ON_ONCE(nest >= MAX_NEST)) cpu_buffer->event_stamp[nest] = info->ts; /* * If we need to add a timestamp, then we * add it to the start of the reserved space. */ if (unlikely(info->add_timestamp)) rb_add_timestamp(cpu_buffer, &event, info, &delta, &length); event->time_delta = delta; length -= RB_EVNT_HDR_SIZE; if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) { event->type_len = 0; event->array[0] = length; } else event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); } static unsigned rb_calculate_event_length(unsigned length) { struct ring_buffer_event event; /* Used only for sizeof array */ /* zero length can cause confusions */ if (!length) length++; if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) length += sizeof(event.array[0]); length += RB_EVNT_HDR_SIZE; length = ALIGN(length, RB_ARCH_ALIGNMENT); /* * In case the time delta is larger than the 27 bits for it * in the header, we need to add a timestamp. If another * event comes in when trying to discard this one to increase * the length, then the timestamp will be added in the allocated * space of this event. If length is bigger than the size needed * for the TIME_EXTEND, then padding has to be used. The events * length must be either RB_LEN_TIME_EXTEND, or greater than or equal * to RB_LEN_TIME_EXTEND + 8, as 8 is the minimum size for padding. * As length is a multiple of 4, we only need to worry if it * is 12 (RB_LEN_TIME_EXTEND + 4). */ if (length == RB_LEN_TIME_EXTEND + RB_ALIGNMENT) length += RB_ALIGNMENT; return length; } static inline int rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) { unsigned long new_index, old_index; struct buffer_page *bpage; unsigned long index; unsigned long addr; new_index = rb_event_index(event); old_index = new_index + rb_event_ts_length(event); addr = (unsigned long)event; addr &= PAGE_MASK; bpage = READ_ONCE(cpu_buffer->tail_page); /* * Make sure the tail_page is still the same and * the next write location is the end of this event */ if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { unsigned long write_mask = local_read(&bpage->write) & ~RB_WRITE_MASK; unsigned long event_length = rb_event_length(event); /* * For the before_stamp to be different than the write_stamp * to make sure that the next event adds an absolute * value and does not rely on the saved write stamp, which * is now going to be bogus. * * By setting the before_stamp to zero, the next event * is not going to use the write_stamp and will instead * create an absolute timestamp. This means there's no * reason to update the wirte_stamp! */ rb_time_set(&cpu_buffer->before_stamp, 0); /* * If an event were to come in now, it would see that the * write_stamp and the before_stamp are different, and assume * that this event just added itself before updating * the write stamp. The interrupting event will fix the * write stamp for us, and use an absolute timestamp. */ /* * This is on the tail page. It is possible that * a write could come in and move the tail page * and write to the next page. That is fine * because we just shorten what is on this page. */ old_index += write_mask; new_index += write_mask; index = local_cmpxchg(&bpage->write, old_index, new_index); if (index == old_index) { /* update counters */ local_sub(event_length, &cpu_buffer->entries_bytes); return 1; } } /* could not discard */ return 0; } static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer) { local_inc(&cpu_buffer->committing); local_inc(&cpu_buffer->commits); } static __always_inline void rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) { unsigned long max_count; /* * We only race with interrupts and NMIs on this CPU. * If we own the commit event, then we can commit * all others that interrupted us, since the interruptions * are in stack format (they finish before they come * back to us). This allows us to do a simple loop to * assign the commit to the tail. */ again: max_count = cpu_buffer->nr_pages * 100; while (cpu_buffer->commit_page != READ_ONCE(cpu_buffer->tail_page)) { if (RB_WARN_ON(cpu_buffer, !(--max_count))) return; if (RB_WARN_ON(cpu_buffer, rb_is_reader_page(cpu_buffer->tail_page))) return; /* * No need for a memory barrier here, as the update * of the tail_page did it for this page. */ local_set(&cpu_buffer->commit_page->page->commit, rb_page_write(cpu_buffer->commit_page)); rb_inc_page(&cpu_buffer->commit_page); /* add barrier to keep gcc from optimizing too much */ barrier(); } while (rb_commit_index(cpu_buffer) != rb_page_write(cpu_buffer->commit_page)) { /* Make sure the readers see the content of what is committed. */ smp_wmb(); local_set(&cpu_buffer->commit_page->page->commit, rb_page_write(cpu_buffer->commit_page)); RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->commit_page->page->commit) & ~RB_WRITE_MASK); barrier(); } /* again, keep gcc from optimizing */ barrier(); /* * If an interrupt came in just after the first while loop * and pushed the tail page forward, we will be left with * a dangling commit that will never go forward. */ if (unlikely(cpu_buffer->commit_page != READ_ONCE(cpu_buffer->tail_page))) goto again; } static __always_inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) { unsigned long commits; if (RB_WARN_ON(cpu_buffer, !local_read(&cpu_buffer->committing))) return; again: commits = local_read(&cpu_buffer->commits); /* synchronize with interrupts */ barrier(); if (local_read(&cpu_buffer->committing) == 1) rb_set_commit_to_write(cpu_buffer); local_dec(&cpu_buffer->committing); /* synchronize with interrupts */ barrier(); /* * Need to account for interrupts coming in between the * updating of the commit page and the clearing of the * committing counter. */ if (unlikely(local_read(&cpu_buffer->commits) != commits) && !local_read(&cpu_buffer->committing)) { local_inc(&cpu_buffer->committing); goto again; } } static inline void rb_event_discard(struct ring_buffer_event *event) { if (extended_time(event)) event = skip_time_extend(event); /* array[0] holds the actual length for the discarded event */ event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE; event->type_len = RINGBUF_TYPE_PADDING; /* time delta must be non zero */ if (!event->time_delta) event->time_delta = 1; } static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) { local_inc(&cpu_buffer->entries); rb_end_commit(cpu_buffer); } static __always_inline void rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) { if (buffer->irq_work.waiters_pending) { buffer->irq_work.waiters_pending = false; /* irq_work_queue() supplies it's own memory barriers */ irq_work_queue(&buffer->irq_work.work); } if (cpu_buffer->irq_work.waiters_pending) { cpu_buffer->irq_work.waiters_pending = false; /* irq_work_queue() supplies it's own memory barriers */ irq_work_queue(&cpu_buffer->irq_work.work); } if (cpu_buffer->last_pages_touch == local_read(&cpu_buffer->pages_touched)) return; if (cpu_buffer->reader_page == cpu_buffer->commit_page) return; if (!cpu_buffer->irq_work.full_waiters_pending) return; cpu_buffer->last_pages_touch = local_read(&cpu_buffer->pages_touched); if (!full_hit(buffer, cpu_buffer->cpu, cpu_buffer->shortest_full)) return; cpu_buffer->irq_work.wakeup_full = true; cpu_buffer->irq_work.full_waiters_pending = false; /* irq_work_queue() supplies it's own memory barriers */ irq_work_queue(&cpu_buffer->irq_work.work); } #ifdef CONFIG_RING_BUFFER_RECORD_RECURSION # define do_ring_buffer_record_recursion() \ do_ftrace_record_recursion(_THIS_IP_, _RET_IP_) #else # define do_ring_buffer_record_recursion() do { } while (0) #endif /* * The lock and unlock are done within a preempt disable section. * The current_context per_cpu variable can only be modified * by the current task between lock and unlock. But it can * be modified more than once via an interrupt. To pass this * information from the lock to the unlock without having to * access the 'in_interrupt()' functions again (which do show * a bit of overhead in something as critical as function tracing, * we use a bitmask trick. * * bit 1 = NMI context * bit 2 = IRQ context * bit 3 = SoftIRQ context * bit 4 = normal context. * * This works because this is the order of contexts that can * preempt other contexts. A SoftIRQ never preempts an IRQ * context. * * When the context is determined, the corresponding bit is * checked and set (if it was set, then a recursion of that context * happened). * * On unlock, we need to clear this bit. To do so, just subtract * 1 from the current_context and AND it to itself. * * (binary) * 101 - 1 = 100 * 101 & 100 = 100 (clearing bit zero) * * 1010 - 1 = 1001 * 1010 & 1001 = 1000 (clearing bit 1) * * The least significant bit can be cleared this way, and it * just so happens that it is the same bit corresponding to * the current context. * * Now the TRANSITION bit breaks the above slightly. The TRANSITION bit * is set when a recursion is detected at the current context, and if * the TRANSITION bit is already set, it will fail the recursion. * This is needed because there's a lag between the changing of * interrupt context and updating the preempt count. In this case, * a false positive will be found. To handle this, one extra recursion * is allowed, and this is done by the TRANSITION bit. If the TRANSITION * bit is already set, then it is considered a recursion and the function * ends. Otherwise, the TRANSITION bit is set, and that bit is returned. * * On the trace_recursive_unlock(), the TRANSITION bit will be the first * to be cleared. Even if it wasn't the context that set it. That is, * if an interrupt comes in while NORMAL bit is set and the ring buffer * is called before preempt_count() is updated, since the check will * be on the NORMAL bit, the TRANSITION bit will then be set. If an * NMI then comes in, it will set the NMI bit, but when the NMI code * does the trace_recursive_unlock() it will clear the TRANSITION bit * and leave the NMI bit set. But this is fine, because the interrupt * code that set the TRANSITION bit will then clear the NMI bit when it * calls trace_recursive_unlock(). If another NMI comes in, it will * set the TRANSITION bit and continue. * * Note: The TRANSITION bit only handles a single transition between context. */ static __always_inline int trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer) { unsigned int val = cpu_buffer->current_context; int bit = interrupt_context_level(); bit = RB_CTX_NORMAL - bit; if (unlikely(val & (1 << (bit + cpu_buffer->nest)))) { /* * It is possible that this was called by transitioning * between interrupt context, and preempt_count() has not * been updated yet. In this case, use the TRANSITION bit. */ bit = RB_CTX_TRANSITION; if (val & (1 << (bit + cpu_buffer->nest))) { do_ring_buffer_record_recursion(); return 1; } } val |= (1 << (bit + cpu_buffer->nest)); cpu_buffer->current_context = val; return 0; } static __always_inline void trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer) { cpu_buffer->current_context &= cpu_buffer->current_context - (1 << cpu_buffer->nest); } /* The recursive locking above uses 5 bits */ #define NESTED_BITS 5 /** * ring_buffer_nest_start - Allow to trace while nested * @buffer: The ring buffer to modify * * The ring buffer has a safety mechanism to prevent recursion. * But there may be a case where a trace needs to be done while * tracing something else. In this case, calling this function * will allow this function to nest within a currently active * ring_buffer_lock_reserve(). * * Call this function before calling another ring_buffer_lock_reserve() and * call ring_buffer_nest_end() after the nested ring_buffer_unlock_commit(). */ void ring_buffer_nest_start(struct trace_buffer *buffer) { struct ring_buffer_per_cpu *cpu_buffer; int cpu; /* Enabled by ring_buffer_nest_end() */ preempt_disable_notrace(); cpu = raw_smp_processor_id(); cpu_buffer = buffer->buffers[cpu]; /* This is the shift value for the above recursive locking */ cpu_buffer->nest += NESTED_BITS; } /** * ring_buffer_nest_end - Allow to trace while nested * @buffer: The ring buffer to modify * * Must be called after ring_buffer_nest_start() and after the * ring_buffer_unlock_commit(). */ void ring_buffer_nest_end(struct trace_buffer *buffer) { struct ring_buffer_per_cpu *cpu_buffer; int cpu; /* disabled by ring_buffer_nest_start() */ cpu = raw_smp_processor_id(); cpu_buffer = buffer->buffers[cpu]; /* This is the shift value for the above recursive locking */ cpu_buffer->nest -= NESTED_BITS; preempt_enable_notrace(); } /** * ring_buffer_unlock_commit - commit a reserved * @buffer: The buffer to commit to * @event: The event pointer to commit. * * This commits the data to the ring buffer, and releases any locks held. * * Must be paired with ring_buffer_lock_reserve. */ int ring_buffer_unlock_commit(struct trace_buffer *buffer, struct ring_buffer_event *event) { struct ring_buffer_per_cpu *cpu_buffer; int cpu = raw_smp_processor_id(); cpu_buffer = buffer->buffers[cpu]; rb_commit(cpu_buffer, event); rb_wakeups(buffer, cpu_buffer); trace_recursive_unlock(cpu_buffer); preempt_enable_notrace(); return 0; } EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); /* Special value to validate all deltas on a page. */ #define CHECK_FULL_PAGE 1L #ifdef CONFIG_RING_BUFFER_VALIDATE_TIME_DELTAS static void dump_buffer_page(struct buffer_data_page *bpage, struct rb_event_info *info, unsigned long tail) { struct ring_buffer_event *event; u64 ts, delta; int e; ts = bpage->time_stamp; pr_warn(" [%lld] PAGE TIME STAMP\n", ts); for (e = 0; e < tail; e += rb_event_length(event)) { event = (struct ring_buffer_event *)(bpage->data + e); switch (event->type_len) { case RINGBUF_TYPE_TIME_EXTEND: delta = rb_event_time_stamp(event); ts += delta; pr_warn(" [%lld] delta:%lld TIME EXTEND\n", ts, delta); break; case RINGBUF_TYPE_TIME_STAMP: delta = rb_event_time_stamp(event); ts = rb_fix_abs_ts(delta, ts); pr_warn(" [%lld] absolute:%lld TIME STAMP\n", ts, delta); break; case RINGBUF_TYPE_PADDING: ts += event->time_delta; pr_warn(" [%lld] delta:%d PADDING\n", ts, event->time_delta); break; case RINGBUF_TYPE_DATA: ts += event->time_delta; pr_warn(" [%lld] delta:%d\n", ts, event->time_delta); break; default: break; } } } static DEFINE_PER_CPU(atomic_t, checking); static atomic_t ts_dump; /* * Check if the current event time stamp matches the deltas on * the buffer page. */ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer, struct rb_event_info *info, unsigned long tail) { struct ring_buffer_event *event; struct buffer_data_page *bpage; u64 ts, delta; bool full = false; int e; bpage = info->tail_page->page; if (tail == CHECK_FULL_PAGE) { full = true; tail = local_read(&bpage->commit); } else if (info->add_timestamp & (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE)) { /* Ignore events with absolute time stamps */ return; } /* * Do not check the first event (skip possible extends too). * Also do not check if previous events have not been committed. */ if (tail <= 8 || tail > local_read(&bpage->commit)) return; /* * If this interrupted another event, */ if (atomic_inc_return(this_cpu_ptr(&checking)) != 1) goto out; ts = bpage->time_stamp; for (e = 0; e < tail; e += rb_event_length(event)) { event = (struct ring_buffer_event *)(bpage->data + e); switch (event->type_len) { case RINGBUF_TYPE_TIME_EXTEND: delta = rb_event_time_stamp(event); ts += delta; break; case RINGBUF_TYPE_TIME_STAMP: delta = rb_event_time_stamp(event); ts = rb_fix_abs_ts(delta, ts); break; case RINGBUF_TYPE_PADDING: if (event->time_delta == 1) break; fallthrough; case RINGBUF_TYPE_DATA: ts += event->time_delta; break; default: RB_WARN_ON(cpu_buffer, 1); } } if ((full && ts > info->ts) || (!full && ts + info->delta != info->ts)) { /* If another report is happening, ignore this one */ if (atomic_inc_return(&ts_dump) != 1) { atomic_dec(&ts_dump); goto out; } atomic_inc(&cpu_buffer->record_disabled); /* There's some cases in boot up that this can happen */ WARN_ON_ONCE(system_state != SYSTEM_BOOTING); pr_warn("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld before:%lld after:%lld%s\n", cpu_buffer->cpu, ts + info->delta, info->ts, info->delta, info->before, info->after, full ? " (full)" : ""); dump_buffer_page(bpage, info, tail); atomic_dec(&ts_dump); /* Do not re-enable checking */ return; } out: atomic_dec(this_cpu_ptr(&checking)); } #else static inline void check_buffer(struct ring_buffer_per_cpu *cpu_buffer, struct rb_event_info *info, unsigned long tail) { } #endif /* CONFIG_RING_BUFFER_VALIDATE_TIME_DELTAS */ static struct ring_buffer_event * __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, struct rb_event_info *info) { struct ring_buffer_event *event; struct buffer_page *tail_page; unsigned long tail, write, w; bool a_ok; bool b_ok; /* Don't let the compiler play games with cpu_buffer->tail_page */ tail_page = info->tail_page = READ_ONCE(cpu_buffer->tail_page); /*A*/ w = local_read(&tail_page->write) & RB_WRITE_MASK; barrier(); b_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before); a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after); barrier(); info->ts = rb_time_stamp(cpu_buffer->buffer); if ((info->add_timestamp & RB_ADD_STAMP_ABSOLUTE)) { info->delta = info->ts; } else { /* * If interrupting an event time update, we may need an * absolute timestamp. * Don't bother if this is the start of a new page (w == 0). */ if (!w) { /* Use the sub-buffer timestamp */ info->delta = 0; } else if (unlikely(!a_ok || !b_ok || info->before != info->after)) { info->add_timestamp |= RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND; info->length += RB_LEN_TIME_EXTEND; } else { info->delta = info->ts - info->after; if (unlikely(test_time_stamp(info->delta))) { info->add_timestamp |= RB_ADD_STAMP_EXTEND; info->length += RB_LEN_TIME_EXTEND; } } } /*B*/ rb_time_set(&cpu_buffer->before_stamp, info->ts); /*C*/ write = local_add_return(info->length, &tail_page->write); /* set write to only the index of the write */ write &= RB_WRITE_MASK; tail = write - info->length; /* See if we shot pass the end of this buffer page */ if (unlikely(write > BUF_PAGE_SIZE)) { check_buffer(cpu_buffer, info, CHECK_FULL_PAGE); return rb_move_tail(cpu_buffer, tail, info); } if (likely(tail == w)) { /* Nothing interrupted us between A and C */ /*D*/ rb_time_set(&cpu_buffer->write_stamp, info->ts); /* * If something came in between C and D, the write stamp * may now not be in sync. But that's fine as the before_stamp * will be different and then next event will just be forced * to use an absolute timestamp. */ if (likely(!(info->add_timestamp & (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE)))) /* This did not interrupt any time update */ info->delta = info->ts - info->after; else /* Just use full timestamp for interrupting event */ info->delta = info->ts; check_buffer(cpu_buffer, info, tail); } else { u64 ts; /* SLOW PATH - Interrupted between A and C */ /* Save the old before_stamp */ a_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before); RB_WARN_ON(cpu_buffer, !a_ok); /* * Read a new timestamp and update the before_stamp to make * the next event after this one force using an absolute * timestamp. This is in case an interrupt were to come in * between E and F. */ ts = rb_time_stamp(cpu_buffer->buffer); rb_time_set(&cpu_buffer->before_stamp, ts); barrier(); /*E*/ a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after); /* Was interrupted before here, write_stamp must be valid */ RB_WARN_ON(cpu_buffer, !a_ok); barrier(); /*F*/ if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) && info->after == info->before && info->after < ts) { /* * Nothing came after this event between C and F, it is * safe to use info->after for the delta as it * matched info->before and is still valid. */ info->delta = ts - info->after; } else { /* * Interrupted between C and F: * Lost the previous events time stamp. Just set the * delta to zero, and this will be the same time as * the event this event interrupted. And the events that * came after this will still be correct (as they would * have built their delta on the previous event. */ info->delta = 0; } info->ts = ts; info->add_timestamp &= ~RB_ADD_STAMP_FORCE; } /* * If this is the first commit on the page, then it has the same * timestamp as the page itself. */ if (unlikely(!tail && !(info->add_timestamp & (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE)))) info->delta = 0; /* We reserved something on the buffer */ event = __rb_page_index(tail_page, tail); rb_update_event(cpu_buffer, event, info); local_inc(&tail_page->entries); /* * If this is the first commit on the page, then update * its timestamp. */ if (unlikely(!tail)) tail_page->page->time_stamp = info->ts; /* account for these added bytes */ local_add(info->length, &cpu_buffer->entries_bytes); return event; } static __always_inline struct ring_buffer_event * rb_reserve_next_event(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer, unsigned long length) { struct ring_buffer_event *event; struct rb_event_info info; int nr_loops = 0; int add_ts_default; /* ring buffer does cmpxchg, make sure it is safe in NMI context */ if (!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) && (unlikely(in_nmi()))) { return NULL; } rb_start_commit(cpu_buffer); /* The commit page can not change after this */ #ifdef CONFIG_RING_BUFFER_ALLOW_SWAP /* * Due to the ability to swap a cpu buffer from a buffer * it is possible it was swapped before we committed. * (committing stops a swap). We check for it here and * if it happened, we have to fail the write. */ barrier(); if (unlikely(READ_ONCE(cpu_buffer->buffer) != buffer)) { local_dec(&cpu_buffer->committing); local_dec(&cpu_buffer->commits); return NULL; } #endif info.length = rb_calculate_event_length(length); if (ring_buffer_time_stamp_abs(cpu_buffer->buffer)) { add_ts_default = RB_ADD_STAMP_ABSOLUTE; info.length += RB_LEN_TIME_EXTEND; if (info.length > BUF_MAX_DATA_SIZE) goto out_fail; } else { add_ts_default = RB_ADD_STAMP_NONE; } again: info.add_timestamp = add_ts_default; info.delta = 0; /* * We allow for interrupts to reenter here and do a trace. * If one does, it will cause this original code to loop * back here. Even with heavy interrupts happening, this * should only happen a few times in a row. If this happens * 1000 times in a row, there must be either an interrupt * storm or we have something buggy. * Bail! */ if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) goto out_fail; event = __rb_reserve_next(cpu_buffer, &info); if (unlikely(PTR_ERR(event) == -EAGAIN)) { if (info.add_timestamp & (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND)) info.length -= RB_LEN_TIME_EXTEND; goto again; } if (likely(event)) return event; out_fail: rb_end_commit(cpu_buffer); return NULL; } /** * ring_buffer_lock_reserve - reserve a part of the buffer * @buffer: the ring buffer to reserve from * @length: the length of the data to reserve (excluding event header) * * Returns a reserved event on the ring buffer to copy directly to. * The user of this interface will need to get the body to write into * and can use the ring_buffer_event_data() interface. * * The length is the length of the data needed, not the event length * which also includes the event header. * * Must be paired with ring_buffer_unlock_commit, unless NULL is returned. * If NULL is returned, then nothing has been allocated or locked. */ struct ring_buffer_event * ring_buffer_lock_reserve(struct trace_buffer *buffer, unsigned long length) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event; int cpu; /* If we are tracing schedule, we don't want to recurse */ preempt_disable_notrace(); if (unlikely(atomic_read(&buffer->record_disabled))) goto out; cpu = raw_smp_processor_id(); if (unlikely(!cpumask_test_cpu(cpu, buffer->cpumask))) goto out; cpu_buffer = buffer->buffers[cpu]; if (unlikely(atomic_read(&cpu_buffer->record_disabled))) goto out; if (unlikely(length > BUF_MAX_DATA_SIZE)) goto out; if (unlikely(trace_recursive_lock(cpu_buffer))) goto out; event = rb_reserve_next_event(buffer, cpu_buffer, length); if (!event) goto out_unlock; return event; out_unlock: trace_recursive_unlock(cpu_buffer); out: preempt_enable_notrace(); return NULL; } EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve); /* * Decrement the entries to the page that an event is on. * The event does not even need to exist, only the pointer * to the page it is on. This may only be called before the commit * takes place. */ static inline void rb_decrement_entry(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) { unsigned long addr = (unsigned long)event; struct buffer_page *bpage = cpu_buffer->commit_page; struct buffer_page *start; addr &= PAGE_MASK; /* Do the likely case first */ if (likely(bpage->page == (void *)addr)) { local_dec(&bpage->entries); return; } /* * Because the commit page may be on the reader page we * start with the next page and check the end loop there. */ rb_inc_page(&bpage); start = bpage; do { if (bpage->page == (void *)addr) { local_dec(&bpage->entries); return; } rb_inc_page(&bpage); } while (bpage != start); /* commit not part of this buffer?? */ RB_WARN_ON(cpu_buffer, 1); } /** * ring_buffer_discard_commit - discard an event that has not been committed * @buffer: the ring buffer * @event: non committed event to discard * * Sometimes an event that is in the ring buffer needs to be ignored. * This function lets the user discard an event in the ring buffer * and then that event will not be read later. * * This function only works if it is called before the item has been * committed. It will try to free the event from the ring buffer * if another event has not been added behind it. * * If another event has been added behind it, it will set the event * up as discarded, and perform the commit. * * If this function is called, do not call ring_buffer_unlock_commit on * the event. */ void ring_buffer_discard_commit(struct trace_buffer *buffer, struct ring_buffer_event *event) { struct ring_buffer_per_cpu *cpu_buffer; int cpu; if (unlikely(has_ext_writer(buffer))) return; /* The event is discarded regardless */ rb_event_discard(event); cpu = smp_processor_id(); cpu_buffer = buffer->buffers[cpu]; /* * This must only be called if the event has not been * committed yet. Thus we can assume that preemption * is still disabled. */ RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing)); rb_decrement_entry(cpu_buffer, event); if (rb_try_to_discard(cpu_buffer, event)) goto out; out: rb_end_commit(cpu_buffer); trace_recursive_unlock(cpu_buffer); preempt_enable_notrace(); } EXPORT_SYMBOL_GPL(ring_buffer_discard_commit); /** * ring_buffer_write - write data to the buffer without reserving * @buffer: The ring buffer to write to. * @length: The length of the data being written (excluding the event header) * @data: The data to write to the buffer. * * This is like ring_buffer_lock_reserve and ring_buffer_unlock_commit as * one function. If you already have the data to write to the buffer, it * may be easier to simply call this function. * * Note, like ring_buffer_lock_reserve, the length is the length of the data * and not the length of the event which would hold the header. */ int ring_buffer_write(struct trace_buffer *buffer, unsigned long length, void *data) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event; void *body; int ret = -EBUSY; int cpu; preempt_disable_notrace(); if (atomic_read(&buffer->record_disabled)) goto out; cpu = raw_smp_processor_id(); if (!cpumask_test_cpu(cpu, buffer->cpumask)) goto out; cpu_buffer = buffer->buffers[cpu]; if (atomic_read(&cpu_buffer->record_disabled)) goto out; if (length > BUF_MAX_DATA_SIZE) goto out; if (unlikely(trace_recursive_lock(cpu_buffer))) goto out; event = rb_reserve_next_event(buffer, cpu_buffer, length); if (!event) goto out_unlock; body = rb_event_data(event); memcpy(body, data, length); rb_commit(cpu_buffer, event); rb_wakeups(buffer, cpu_buffer); ret = 0; out_unlock: trace_recursive_unlock(cpu_buffer); out: preempt_enable_notrace(); return ret; } EXPORT_SYMBOL_GPL(ring_buffer_write); static bool rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) { struct buffer_page *reader = cpu_buffer->reader_page; struct buffer_page *head = rb_set_head_page(cpu_buffer); struct buffer_page *commit = cpu_buffer->commit_page; /* In case of error, head will be NULL */ if (unlikely(!head)) return true; /* Reader should exhaust content in reader page */ if (reader->read != rb_page_commit(reader)) return false; /* * If writers are committing on the reader page, knowing all * committed content has been read, the ring buffer is empty. */ if (commit == reader) return true; /* * If writers are committing on a page other than reader page * and head page, there should always be content to read. */ if (commit != head) return false; /* * Writers are committing on the head page, we just need * to care about there're committed data, and the reader will * swap reader page with head page when it is to read data. */ return rb_page_commit(commit) == 0; } /** * ring_buffer_record_disable - stop all writes into the buffer * @buffer: The ring buffer to stop writes to. * * This prevents all writes to the buffer. Any attempt to write * to the buffer after this will fail and return NULL. * * The caller should call synchronize_rcu() after this. */ void ring_buffer_record_disable(struct trace_buffer *buffer) { atomic_inc(&buffer->record_disabled); } EXPORT_SYMBOL_GPL(ring_buffer_record_disable); /** * ring_buffer_record_enable - enable writes to the buffer * @buffer: The ring buffer to enable writes * * Note, multiple disables will need the same number of enables * to truly enable the writing (much like preempt_disable). */ void ring_buffer_record_enable(struct trace_buffer *buffer) { if (unlikely(has_ext_writer(buffer))) return; atomic_dec(&buffer->record_disabled); } EXPORT_SYMBOL_GPL(ring_buffer_record_enable); /** * ring_buffer_record_off - stop all writes into the buffer * @buffer: The ring buffer to stop writes to. * * This prevents all writes to the buffer. Any attempt to write * to the buffer after this will fail and return NULL. * * This is different than ring_buffer_record_disable() as * it works like an on/off switch, where as the disable() version * must be paired with a enable(). */ void ring_buffer_record_off(struct trace_buffer *buffer) { unsigned int rd; unsigned int new_rd; do { rd = atomic_read(&buffer->record_disabled); new_rd = rd | RB_BUFFER_OFF; } while (atomic_cmpxchg(&buffer->record_disabled, rd, new_rd) != rd); } EXPORT_SYMBOL_GPL(ring_buffer_record_off); /** * ring_buffer_record_on - restart writes into the buffer * @buffer: The ring buffer to start writes to. * * This enables all writes to the buffer that was disabled by * ring_buffer_record_off(). * * This is different than ring_buffer_record_enable() as * it works like an on/off switch, where as the enable() version * must be paired with a disable(). */ void ring_buffer_record_on(struct trace_buffer *buffer) { unsigned int rd; unsigned int new_rd; if (unlikely(has_ext_writer(buffer))) return; do { rd = atomic_read(&buffer->record_disabled); new_rd = rd & ~RB_BUFFER_OFF; } while (atomic_cmpxchg(&buffer->record_disabled, rd, new_rd) != rd); } EXPORT_SYMBOL_GPL(ring_buffer_record_on); /** * ring_buffer_record_is_on - return true if the ring buffer can write * @buffer: The ring buffer to see if write is enabled * * Returns true if the ring buffer is in a state that it accepts writes. */ bool ring_buffer_record_is_on(struct trace_buffer *buffer) { return !atomic_read(&buffer->record_disabled); } /** * ring_buffer_record_is_set_on - return true if the ring buffer is set writable * @buffer: The ring buffer to see if write is set enabled * * Returns true if the ring buffer is set writable by ring_buffer_record_on(). * Note that this does NOT mean it is in a writable state. * * It may return true when the ring buffer has been disabled by * ring_buffer_record_disable(), as that is a temporary disabling of * the ring buffer. */ bool ring_buffer_record_is_set_on(struct trace_buffer *buffer) { return !(atomic_read(&buffer->record_disabled) & RB_BUFFER_OFF); } /** * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer * @buffer: The ring buffer to stop writes to. * @cpu: The CPU buffer to stop * * This prevents all writes to the buffer. Any attempt to write * to the buffer after this will fail and return NULL. * * The caller should call synchronize_rcu() after this. */ void ring_buffer_record_disable_cpu(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return; cpu_buffer = buffer->buffers[cpu]; atomic_inc(&cpu_buffer->record_disabled); } EXPORT_SYMBOL_GPL(ring_buffer_record_disable_cpu); /** * ring_buffer_record_enable_cpu - enable writes to the buffer * @buffer: The ring buffer to enable writes * @cpu: The CPU to enable. * * Note, multiple disables will need the same number of enables * to truly enable the writing (much like preempt_disable). */ void ring_buffer_record_enable_cpu(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return; cpu_buffer = buffer->buffers[cpu]; atomic_dec(&cpu_buffer->record_disabled); } EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu); /* * The total entries in the ring buffer is the running counter * of entries entered into the ring buffer, minus the sum of * the entries read from the ring buffer and the number of * entries that were overwritten. */ static inline unsigned long rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer) { return local_read(&cpu_buffer->entries) - (local_read(&cpu_buffer->overrun) + cpu_buffer->read); } /** * ring_buffer_oldest_event_ts - get the oldest event timestamp from the buffer * @buffer: The ring buffer * @cpu: The per CPU buffer to read from. */ u64 ring_buffer_oldest_event_ts(struct trace_buffer *buffer, int cpu) { unsigned long flags; struct ring_buffer_per_cpu *cpu_buffer; struct buffer_page *bpage; u64 ret = 0; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 0; cpu_buffer = buffer->buffers[cpu]; raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); /* * if the tail is on reader_page, oldest time stamp is on the reader * page */ if (cpu_buffer->tail_page == cpu_buffer->reader_page) bpage = cpu_buffer->reader_page; else bpage = rb_set_head_page(cpu_buffer); if (bpage) ret = bpage->page->time_stamp; raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); return ret; } EXPORT_SYMBOL_GPL(ring_buffer_oldest_event_ts); /** * ring_buffer_bytes_cpu - get the number of bytes unconsumed in a cpu buffer * @buffer: The ring buffer * @cpu: The per CPU buffer to read from. */ unsigned long ring_buffer_bytes_cpu(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long ret; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 0; cpu_buffer = buffer->buffers[cpu]; ret = local_read(&cpu_buffer->entries_bytes) - cpu_buffer->read_bytes; return ret; } EXPORT_SYMBOL_GPL(ring_buffer_bytes_cpu); /** * ring_buffer_entries_cpu - get the number of entries in a cpu buffer * @buffer: The ring buffer * @cpu: The per CPU buffer to get the entries from. */ unsigned long ring_buffer_entries_cpu(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 0; cpu_buffer = buffer->buffers[cpu]; return rb_num_of_entries(cpu_buffer); } EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu); /** * ring_buffer_overrun_cpu - get the number of overruns caused by the ring * buffer wrapping around (only if RB_FL_OVERWRITE is on). * @buffer: The ring buffer * @cpu: The per CPU buffer to get the number of overruns from */ unsigned long ring_buffer_overrun_cpu(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long ret; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 0; cpu_buffer = buffer->buffers[cpu]; ret = local_read(&cpu_buffer->overrun); return ret; } EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu); /** * ring_buffer_commit_overrun_cpu - get the number of overruns caused by * commits failing due to the buffer wrapping around while there are uncommitted * events, such as during an interrupt storm. * @buffer: The ring buffer * @cpu: The per CPU buffer to get the number of overruns from */ unsigned long ring_buffer_commit_overrun_cpu(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long ret; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 0; cpu_buffer = buffer->buffers[cpu]; ret = local_read(&cpu_buffer->commit_overrun); return ret; } EXPORT_SYMBOL_GPL(ring_buffer_commit_overrun_cpu); /** * ring_buffer_dropped_events_cpu - get the number of dropped events caused by * the ring buffer filling up (only if RB_FL_OVERWRITE is off). * @buffer: The ring buffer * @cpu: The per CPU buffer to get the number of overruns from */ unsigned long ring_buffer_dropped_events_cpu(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long ret; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 0; cpu_buffer = buffer->buffers[cpu]; ret = local_read(&cpu_buffer->dropped_events); return ret; } EXPORT_SYMBOL_GPL(ring_buffer_dropped_events_cpu); /** * ring_buffer_read_events_cpu - get the number of events successfully read * @buffer: The ring buffer * @cpu: The per CPU buffer to get the number of events read */ unsigned long ring_buffer_read_events_cpu(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 0; cpu_buffer = buffer->buffers[cpu]; return cpu_buffer->read; } EXPORT_SYMBOL_GPL(ring_buffer_read_events_cpu); /** * ring_buffer_entries - get the number of entries in a buffer * @buffer: The ring buffer * * Returns the total number of entries in the ring buffer * (all CPU entries) */ unsigned long ring_buffer_entries(struct trace_buffer *buffer) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long entries = 0; int cpu; /* if you care about this being correct, lock the buffer */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; entries += rb_num_of_entries(cpu_buffer); } return entries; } EXPORT_SYMBOL_GPL(ring_buffer_entries); /** * ring_buffer_overruns - get the number of overruns in buffer * @buffer: The ring buffer * * Returns the total number of overruns in the ring buffer * (all CPU entries) */ unsigned long ring_buffer_overruns(struct trace_buffer *buffer) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long overruns = 0; int cpu; /* if you care about this being correct, lock the buffer */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; overruns += local_read(&cpu_buffer->overrun); } return overruns; } EXPORT_SYMBOL_GPL(ring_buffer_overruns); static void rb_iter_reset(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; /* Iterator usage is expected to have record disabled */ iter->head_page = cpu_buffer->reader_page; iter->head = cpu_buffer->reader_page->read; iter->next_event = iter->head; iter->cache_reader_page = iter->head_page; iter->cache_read = cpu_buffer->read; if (iter->head) { iter->read_stamp = cpu_buffer->read_stamp; iter->page_stamp = cpu_buffer->reader_page->page->time_stamp; } else { iter->read_stamp = iter->head_page->page->time_stamp; iter->page_stamp = iter->read_stamp; } } /** * ring_buffer_iter_reset - reset an iterator * @iter: The iterator to reset * * Resets the iterator, so that it will start from the beginning * again. */ void ring_buffer_iter_reset(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long flags; if (!iter) return; cpu_buffer = iter->cpu_buffer; raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); rb_iter_reset(iter); raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } EXPORT_SYMBOL_GPL(ring_buffer_iter_reset); /** * ring_buffer_iter_empty - check if an iterator has no more to read * @iter: The iterator to check */ int ring_buffer_iter_empty(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer; struct buffer_page *reader; struct buffer_page *head_page; struct buffer_page *commit_page; struct buffer_page *curr_commit_page; unsigned commit; u64 curr_commit_ts; u64 commit_ts; cpu_buffer = iter->cpu_buffer; reader = cpu_buffer->reader_page; head_page = cpu_buffer->head_page; commit_page = READ_ONCE(cpu_buffer->commit_page); commit_ts = commit_page->page->time_stamp; /* * When the writer goes across pages, it issues a cmpxchg which * is a mb(), which will synchronize with the rmb here. * (see rb_tail_page_update()) */ smp_rmb(); commit = rb_page_commit(commit_page); /* We want to make sure that the commit page doesn't change */ smp_rmb(); /* Make sure commit page didn't change */ curr_commit_page = READ_ONCE(cpu_buffer->commit_page); curr_commit_ts = READ_ONCE(curr_commit_page->page->time_stamp); /* If the commit page changed, then there's more data */ if (curr_commit_page != commit_page || curr_commit_ts != commit_ts) return 0; /* Still racy, as it may return a false positive, but that's OK */ return ((iter->head_page == commit_page && iter->head >= commit) || (iter->head_page == reader && commit_page == head_page && head_page->read == commit && iter->head == rb_page_commit(cpu_buffer->reader_page))); } EXPORT_SYMBOL_GPL(ring_buffer_iter_empty); static void rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) { u64 delta; switch (event->type_len) { case RINGBUF_TYPE_PADDING: return; case RINGBUF_TYPE_TIME_EXTEND: delta = rb_event_time_stamp(event); cpu_buffer->read_stamp += delta; return; case RINGBUF_TYPE_TIME_STAMP: delta = rb_event_time_stamp(event); delta = rb_fix_abs_ts(delta, cpu_buffer->read_stamp); cpu_buffer->read_stamp = delta; return; case RINGBUF_TYPE_DATA: cpu_buffer->read_stamp += event->time_delta; return; default: RB_WARN_ON(cpu_buffer, 1); } return; } static void rb_update_iter_read_stamp(struct ring_buffer_iter *iter, struct ring_buffer_event *event) { u64 delta; switch (event->type_len) { case RINGBUF_TYPE_PADDING: return; case RINGBUF_TYPE_TIME_EXTEND: delta = rb_event_time_stamp(event); iter->read_stamp += delta; return; case RINGBUF_TYPE_TIME_STAMP: delta = rb_event_time_stamp(event); delta = rb_fix_abs_ts(delta, iter->read_stamp); iter->read_stamp = delta; return; case RINGBUF_TYPE_DATA: iter->read_stamp += event->time_delta; return; default: RB_WARN_ON(iter->cpu_buffer, 1); } return; } static void __set_head_page_flag(struct buffer_page *head, int flag) { struct list_head *prev = head->list.prev; prev->next = (struct list_head *)(((unsigned long)prev->next & ~RB_FLAG_MASK) | flag); } static int __read_footer_reader_status(struct buffer_page *bpage) { struct rb_ext_page_footer *footer = rb_ext_page_get_footer(bpage->page); return atomic_read(&footer->reader_status); } static int __read_footer_writer_status(struct buffer_page *bpage) { struct rb_ext_page_footer *footer = rb_ext_page_get_footer(bpage->page); return atomic_read(&footer->writer_status); } static struct buffer_page * ring_buffer_search_footer(struct buffer_page *start, unsigned long flag) { bool search_writer = flag == RB_PAGE_FT_COMMIT; struct buffer_page *bpage = start; unsigned long status; int cnt = 0; again: do { status = search_writer ? __read_footer_writer_status(bpage) : __read_footer_reader_status(bpage); if (flag & status) return bpage; rb_inc_page(&bpage); } while (bpage != start); /* * There's a chance the writer is in the middle of moving the flag and * we might not find anything after a first round. Let's try again. */ if (cnt++ < 3) goto again; return NULL; } static struct buffer_page * noinline rb_swap_reader_page_ext(struct ring_buffer_per_cpu *cpu_buffer) { struct buffer_page *new_reader, *new_rb_page, *new_head; struct rb_ext_page_footer *footer; unsigned long overrun; if (cpu_buffer->buffer->ext_cb->swap_reader(cpu_buffer->cpu)) { WARN_ON(1); return NULL; } new_rb_page = cpu_buffer->reader_page; /* * Find what page is the new reader... starting with the latest known * head. */ new_reader = ring_buffer_search_footer(cpu_buffer->head_page, RB_PAGE_FT_READER); if (!new_reader) { WARN_ON(1); return NULL; } /* ... and install it into the ring buffer in place of the old head */ rb_list_head_clear(&new_reader->list); new_rb_page->list.next = new_reader->list.next; new_rb_page->list.prev = new_reader->list.prev; new_rb_page->list.next->prev = &new_rb_page->list; new_rb_page->list.prev->next = &new_rb_page->list; cpu_buffer->reader_page = new_reader; cpu_buffer->reader_page->read = 0; /* Install the new head page */ new_head = new_rb_page; rb_inc_page(&new_head); cpu_buffer->head_page = new_head; /* * cpu_buffer->pages just needs to point to the buffer, it * has no specific buffer page to point to. Lets move it out * of our way so we don't accidentally swap it. */ cpu_buffer->pages = &new_head->list; __set_head_page_flag(new_head, RB_PAGE_HEAD); footer = rb_ext_page_get_footer(new_reader->page); overrun = footer->stats.overrun; if (overrun != cpu_buffer->last_overrun) { cpu_buffer->lost_events = overrun - cpu_buffer->last_overrun; cpu_buffer->last_overrun = overrun; } return new_reader; } static struct buffer_page * rb_swap_reader_page(struct ring_buffer_per_cpu *cpu_buffer) { struct buffer_page *reader; unsigned long overwrite; int ret; /* * Reset the reader page to size zero. */ local_set(&cpu_buffer->reader_page->write, 0); local_set(&cpu_buffer->reader_page->entries, 0); local_set(&cpu_buffer->reader_page->page->commit, 0); cpu_buffer->reader_page->real_end = 0; spin: /* * Splice the empty reader page into the list around the head. */ reader = rb_set_head_page(cpu_buffer); if (!reader) return NULL; cpu_buffer->reader_page->list.next = rb_list_head(reader->list.next); cpu_buffer->reader_page->list.prev = reader->list.prev; /* * cpu_buffer->pages just needs to point to the buffer, it * has no specific buffer page to point to. Lets move it out * of our way so we don't accidentally swap it. */ cpu_buffer->pages = reader->list.prev; /* The reader page will be pointing to the new head */ rb_set_list_to_head(&cpu_buffer->reader_page->list); /* * We want to make sure we read the overruns after we set up our * pointers to the next object. The writer side does a * cmpxchg to cross pages which acts as the mb on the writer * side. Note, the reader will constantly fail the swap * while the writer is updating the pointers, so this * guarantees that the overwrite recorded here is the one we * want to compare with the last_overrun. */ smp_mb(); overwrite = local_read(&(cpu_buffer->overrun)); /* * Here's the tricky part. * * We need to move the pointer past the header page. * But we can only do that if a writer is not currently * moving it. The page before the header page has the * flag bit '1' set if it is pointing to the page we want. * but if the writer is in the process of moving it * than it will be '2' or already moved '0'. */ ret = rb_head_page_replace(reader, cpu_buffer->reader_page); /* * If we did not convert it, then we must try again. */ if (!ret) goto spin; /* * Yay! We succeeded in replacing the page. * * Now make the new head point back to the reader page. */ rb_list_head(reader->list.next)->prev = &cpu_buffer->reader_page->list; rb_inc_page(&cpu_buffer->head_page); local_inc(&cpu_buffer->pages_read); /* Finally update the reader page to the new head */ cpu_buffer->reader_page = reader; cpu_buffer->reader_page->read = 0; if (overwrite != cpu_buffer->last_overrun) { cpu_buffer->lost_events = overwrite - cpu_buffer->last_overrun; cpu_buffer->last_overrun = overwrite; } return reader; } static struct buffer_page * rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) { struct buffer_page *reader = NULL; unsigned long flags; int nr_loops = 0; unsigned int page_size; local_irq_save(flags); arch_spin_lock(&cpu_buffer->lock); again: /* * This should normally only loop twice. But because the * start of the reader inserts an empty page, it causes * a case where we will loop three times. There should be no * reason to loop four times (that I know of). */ if (RB_WARN_ON(cpu_buffer, ++nr_loops > 3)) { reader = NULL; goto out; } reader = cpu_buffer->reader_page; /* If there's more to read, return this page */ if (cpu_buffer->reader_page->read < rb_page_size(reader)) goto out; page_size = rb_page_size(reader); /* Never should we have an index greater than the size */ if (RB_WARN_ON(cpu_buffer, cpu_buffer->reader_page->read > page_size)) goto out; /* check if we caught up to the tail */ reader = NULL; if (cpu_buffer->commit_page == cpu_buffer->reader_page) goto out; /* Don't bother swapping if the ring buffer is empty */ if (rb_num_of_entries(cpu_buffer) == 0) goto out; if (rb_has_ext_writer(cpu_buffer)) reader = rb_swap_reader_page_ext(cpu_buffer); else reader = rb_swap_reader_page(cpu_buffer); if (reader) goto again; out: /* Update the read_stamp on the first event */ if (reader && reader->read == 0) cpu_buffer->read_stamp = reader->page->time_stamp; arch_spin_unlock(&cpu_buffer->lock); local_irq_restore(flags); /* * The writer has preempt disable, wait for it. But not forever * Although, 1 second is pretty much "forever" */ #define USECS_WAIT 1000000 for (nr_loops = 0; nr_loops < USECS_WAIT; nr_loops++) { /* If the write is past the end of page, a writer is still updating it */ if (likely(!reader || rb_page_write(reader) <= BUF_PAGE_SIZE)) break; udelay(1); /* Get the latest version of the reader write value */ smp_rmb(); } /* The writer is not moving forward? Something is wrong */ if (RB_WARN_ON(cpu_buffer, nr_loops == USECS_WAIT)) reader = NULL; /* * Make sure we see any padding after the write update * (see rb_reset_tail()). * * In addition, a writer may be writing on the reader page * if the page has not been fully filled, so the read barrier * is also needed to make sure we see the content of what is * committed by the writer (see rb_set_commit_to_write()). */ smp_rmb(); return reader; } static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) { struct ring_buffer_event *event; struct buffer_page *reader; unsigned length; reader = rb_get_reader_page(cpu_buffer); /* This function should not be called when buffer is empty */ if (RB_WARN_ON(cpu_buffer, !reader)) return; event = rb_reader_event(cpu_buffer); if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX) cpu_buffer->read++; rb_update_read_stamp(cpu_buffer, event); length = rb_event_length(event); cpu_buffer->reader_page->read += length; cpu_buffer->read_bytes += length; } static void rb_advance_iter(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer; cpu_buffer = iter->cpu_buffer; /* If head == next_event then we need to jump to the next event */ if (iter->head == iter->next_event) { /* If the event gets overwritten again, there's nothing to do */ if (rb_iter_head_event(iter) == NULL) return; } iter->head = iter->next_event; /* * Check if we are at the end of the buffer. */ if (iter->next_event >= rb_page_size(iter->head_page)) { /* discarded commits can make the page empty */ if (iter->head_page == cpu_buffer->commit_page) return; rb_inc_iter(iter); return; } rb_update_iter_read_stamp(iter, iter->event); } static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer) { return cpu_buffer->lost_events; } static struct ring_buffer_event * rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts, unsigned long *lost_events) { struct ring_buffer_event *event; struct buffer_page *reader; int nr_loops = 0; if (ts) *ts = 0; again: /* * We repeat when a time extend is encountered. * Since the time extend is always attached to a data event, * we should never loop more than once. * (We never hit the following condition more than twice). */ if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2)) return NULL; reader = rb_get_reader_page(cpu_buffer); if (!reader) return NULL; event = rb_reader_event(cpu_buffer); switch (event->type_len) { case RINGBUF_TYPE_PADDING: if (rb_null_event(event)) RB_WARN_ON(cpu_buffer, 1); /* * Because the writer could be discarding every * event it creates (which would probably be bad) * if we were to go back to "again" then we may never * catch up, and will trigger the warn on, or lock * the box. Return the padding, and we will release * the current locks, and try again. */ return event; case RINGBUF_TYPE_TIME_EXTEND: /* Internal data, OK to advance */ rb_advance_reader(cpu_buffer); goto again; case RINGBUF_TYPE_TIME_STAMP: if (ts) { *ts = rb_event_time_stamp(event); *ts = rb_fix_abs_ts(*ts, reader->page->time_stamp); ring_buffer_normalize_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu, ts); } /* Internal data, OK to advance */ rb_advance_reader(cpu_buffer); goto again; case RINGBUF_TYPE_DATA: if (ts && !(*ts)) { *ts = cpu_buffer->read_stamp + event->time_delta; ring_buffer_normalize_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu, ts); } if (lost_events) *lost_events = rb_lost_events(cpu_buffer); return event; default: RB_WARN_ON(cpu_buffer, 1); } return NULL; } EXPORT_SYMBOL_GPL(ring_buffer_peek); static struct ring_buffer_event * rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) { struct trace_buffer *buffer; struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event; int nr_loops = 0; if (ts) *ts = 0; cpu_buffer = iter->cpu_buffer; buffer = cpu_buffer->buffer; /* * Check if someone performed a consuming read to * the buffer. A consuming read invalidates the iterator * and we need to reset the iterator in this case. */ if (unlikely(iter->cache_read != cpu_buffer->read || iter->cache_reader_page != cpu_buffer->reader_page)) rb_iter_reset(iter); again: if (ring_buffer_iter_empty(iter)) return NULL; /* * As the writer can mess with what the iterator is trying * to read, just give up if we fail to get an event after * three tries. The iterator is not as reliable when reading * the ring buffer with an active write as the consumer is. * Do not warn if the three failures is reached. */ if (++nr_loops > 3) return NULL; if (rb_per_cpu_empty(cpu_buffer)) return NULL; if (iter->head >= rb_page_size(iter->head_page)) { rb_inc_iter(iter); goto again; } event = rb_iter_head_event(iter); if (!event) goto again; switch (event->type_len) { case RINGBUF_TYPE_PADDING: if (rb_null_event(event)) { rb_inc_iter(iter); goto again; } rb_advance_iter(iter); return event; case RINGBUF_TYPE_TIME_EXTEND: /* Internal data, OK to advance */ rb_advance_iter(iter); goto again; case RINGBUF_TYPE_TIME_STAMP: if (ts) { *ts = rb_event_time_stamp(event); *ts = rb_fix_abs_ts(*ts, iter->head_page->page->time_stamp); ring_buffer_normalize_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu, ts); } /* Internal data, OK to advance */ rb_advance_iter(iter); goto again; case RINGBUF_TYPE_DATA: if (ts && !(*ts)) { *ts = iter->read_stamp + event->time_delta; ring_buffer_normalize_time_stamp(buffer, cpu_buffer->cpu, ts); } return event; default: RB_WARN_ON(cpu_buffer, 1); } return NULL; } EXPORT_SYMBOL_GPL(ring_buffer_iter_peek); static inline bool rb_reader_lock(struct ring_buffer_per_cpu *cpu_buffer) { if (likely(!in_nmi())) { raw_spin_lock(&cpu_buffer->reader_lock); return true; } /* * If an NMI die dumps out the content of the ring buffer * trylock must be used to prevent a deadlock if the NMI * preempted a task that holds the ring buffer locks. If * we get the lock then all is fine, if not, then continue * to do the read, but this can corrupt the ring buffer, * so it must be permanently disabled from future writes. * Reading from NMI is a oneshot deal. */ if (raw_spin_trylock(&cpu_buffer->reader_lock)) return true; /* Continue without locking, but disable the ring buffer */ atomic_inc(&cpu_buffer->record_disabled); return false; } static inline void rb_reader_unlock(struct ring_buffer_per_cpu *cpu_buffer, bool locked) { if (likely(locked)) raw_spin_unlock(&cpu_buffer->reader_lock); return; } /** * ring_buffer_peek - peek at the next event to be read * @buffer: The ring buffer to read * @cpu: The cpu to peak at * @ts: The timestamp counter of this event. * @lost_events: a variable to store if events were lost (may be NULL) * * This will return the event that will be read next, but does * not consume the data. */ struct ring_buffer_event * ring_buffer_peek(struct trace_buffer *buffer, int cpu, u64 *ts, unsigned long *lost_events) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; struct ring_buffer_event *event; unsigned long flags; bool dolock; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return NULL; again: local_irq_save(flags); dolock = rb_reader_lock(cpu_buffer); event = rb_buffer_peek(cpu_buffer, ts, lost_events); if (event && event->type_len == RINGBUF_TYPE_PADDING) rb_advance_reader(cpu_buffer); rb_reader_unlock(cpu_buffer, dolock); local_irq_restore(flags); if (event && event->type_len == RINGBUF_TYPE_PADDING) goto again; return event; } /** ring_buffer_iter_dropped - report if there are dropped events * @iter: The ring buffer iterator * * Returns true if there was dropped events since the last peek. */ bool ring_buffer_iter_dropped(struct ring_buffer_iter *iter) { bool ret = iter->missed_events != 0; iter->missed_events = 0; return ret; } EXPORT_SYMBOL_GPL(ring_buffer_iter_dropped); /** * ring_buffer_iter_peek - peek at the next event to be read * @iter: The ring buffer iterator * @ts: The timestamp counter of this event. * * This will return the event that will be read next, but does * not increment the iterator. */ struct ring_buffer_event * ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) { struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; struct ring_buffer_event *event; unsigned long flags; again: raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); event = rb_iter_peek(iter, ts); raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); if (event && event->type_len == RINGBUF_TYPE_PADDING) goto again; return event; } /** * ring_buffer_consume - return an event and consume it * @buffer: The ring buffer to get the next event from * @cpu: the cpu to read the buffer from * @ts: a variable to store the timestamp (may be NULL) * @lost_events: a variable to store if events were lost (may be NULL) * * Returns the next event in the ring buffer, and that event is consumed. * Meaning, that sequential reads will keep returning a different event, * and eventually empty the ring buffer if the producer is slower. */ struct ring_buffer_event * ring_buffer_consume(struct trace_buffer *buffer, int cpu, u64 *ts, unsigned long *lost_events) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event = NULL; unsigned long flags; bool dolock; again: /* might be called in atomic */ preempt_disable(); if (!cpumask_test_cpu(cpu, buffer->cpumask)) goto out; cpu_buffer = buffer->buffers[cpu]; local_irq_save(flags); dolock = rb_reader_lock(cpu_buffer); event = rb_buffer_peek(cpu_buffer, ts, lost_events); if (event) { cpu_buffer->lost_events = 0; rb_advance_reader(cpu_buffer); } rb_reader_unlock(cpu_buffer, dolock); local_irq_restore(flags); out: preempt_enable(); if (event && event->type_len == RINGBUF_TYPE_PADDING) goto again; return event; } EXPORT_SYMBOL_GPL(ring_buffer_consume); static void ring_buffer_update_view(struct ring_buffer_per_cpu *cpu_buffer) { struct rb_ext_page_footer *footer; struct buffer_page *bpage; if (!rb_has_ext_writer(cpu_buffer)) return; raw_spin_lock_irq(&cpu_buffer->reader_lock); arch_spin_lock(&cpu_buffer->lock); cpu_buffer->buffer->ext_cb->update_footers(cpu_buffer->cpu); bpage = cpu_buffer->reader_page; footer = rb_ext_page_get_footer(bpage->page); local_set(&cpu_buffer->entries, footer->stats.entries); local_set(&cpu_buffer->pages_touched, footer->stats.pages_touched); local_set(&cpu_buffer->overrun, footer->stats.overrun); /* Update the commit page */ bpage = ring_buffer_search_footer(cpu_buffer->commit_page, RB_PAGE_FT_COMMIT); if (!bpage) { WARN_ON(1); goto unlock; } cpu_buffer->commit_page = bpage; /* Update the head page */ bpage = ring_buffer_search_footer(cpu_buffer->head_page, RB_PAGE_FT_HEAD); if (!bpage) { WARN_ON(1); goto unlock; } /* Reset the previous RB_PAGE_HEAD flag */ __set_head_page_flag(cpu_buffer->head_page, RB_PAGE_NORMAL); /* Set RB_PAGE_HEAD flag pointing to the new head */ __set_head_page_flag(bpage, RB_PAGE_HEAD); cpu_buffer->reader_page->list.next = &cpu_buffer->head_page->list; cpu_buffer->head_page = bpage; unlock: arch_spin_unlock(&cpu_buffer->lock); raw_spin_unlock_irq(&cpu_buffer->reader_lock); } int ring_buffer_poke(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return -EINVAL; cpu_buffer = buffer->buffers[cpu]; ring_buffer_update_view(cpu_buffer); rb_wakeups(buffer, cpu_buffer); return 0; } /** * ring_buffer_read_prepare - Prepare for a non consuming read of the buffer * @buffer: The ring buffer to read from * @cpu: The cpu buffer to iterate over * @flags: gfp flags to use for memory allocation * * This performs the initial preparations necessary to iterate * through the buffer. Memory is allocated, buffer recording * is disabled, and the iterator pointer is returned to the caller. * * Disabling buffer recording prevents the reading from being * corrupted. This is not a consuming read, so a producer is not * expected. * * After a sequence of ring_buffer_read_prepare calls, the user is * expected to make at least one call to ring_buffer_read_prepare_sync. * Afterwards, ring_buffer_read_start is invoked to get things going * for real. * * This overall must be paired with ring_buffer_read_finish. */ struct ring_buffer_iter * ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_iter *iter; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return NULL; iter = kzalloc(sizeof(*iter), flags); if (!iter) return NULL; /* Holds the entire event: data and meta data */ iter->event = kmalloc(BUF_PAGE_SIZE, flags); if (!iter->event) { kfree(iter); return NULL; } cpu_buffer = buffer->buffers[cpu]; iter->cpu_buffer = cpu_buffer; atomic_inc(&cpu_buffer->resize_disabled); ring_buffer_update_view(cpu_buffer); return iter; } EXPORT_SYMBOL_GPL(ring_buffer_read_prepare); /** * ring_buffer_read_prepare_sync - Synchronize a set of prepare calls * * All previously invoked ring_buffer_read_prepare calls to prepare * iterators will be synchronized. Afterwards, read_buffer_read_start * calls on those iterators are allowed. */ void ring_buffer_read_prepare_sync(void) { synchronize_rcu(); } EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync); /** * ring_buffer_read_start - start a non consuming read of the buffer * @iter: The iterator returned by ring_buffer_read_prepare * * This finalizes the startup of an iteration through the buffer. * The iterator comes from a call to ring_buffer_read_prepare and * an intervening ring_buffer_read_prepare_sync must have been * performed. * * Must be paired with ring_buffer_read_finish. */ void ring_buffer_read_start(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long flags; if (!iter) return; cpu_buffer = iter->cpu_buffer; raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); arch_spin_lock(&cpu_buffer->lock); rb_iter_reset(iter); arch_spin_unlock(&cpu_buffer->lock); raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } EXPORT_SYMBOL_GPL(ring_buffer_read_start); /** * ring_buffer_read_finish - finish reading the iterator of the buffer * @iter: The iterator retrieved by ring_buffer_start * * This re-enables the recording to the buffer, and frees the * iterator. */ void ring_buffer_read_finish(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; unsigned long flags; /* * Ring buffer is disabled from recording, here's a good place * to check the integrity of the ring buffer. * Must prevent readers from trying to read, as the check * clears the HEAD page and readers require it. */ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); rb_check_pages(cpu_buffer); raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); atomic_dec(&cpu_buffer->resize_disabled); kfree(iter->event); kfree(iter); } EXPORT_SYMBOL_GPL(ring_buffer_read_finish); /** * ring_buffer_iter_advance - advance the iterator to the next location * @iter: The ring buffer iterator * * Move the location of the iterator such that the next read will * be the next location of the iterator. */ void ring_buffer_iter_advance(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; unsigned long flags; raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); rb_advance_iter(iter); raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } EXPORT_SYMBOL_GPL(ring_buffer_iter_advance); /** * ring_buffer_size - return the size of the ring buffer (in bytes) * @buffer: The ring buffer. * @cpu: The CPU to get ring buffer size from. */ unsigned long ring_buffer_size(struct trace_buffer *buffer, int cpu) { /* * Earlier, this method returned * BUF_PAGE_SIZE * buffer->nr_pages * Since the nr_pages field is now removed, we have converted this to * return the per cpu buffer value. */ if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 0; return BUF_PAGE_SIZE * buffer->buffers[cpu]->nr_pages; } EXPORT_SYMBOL_GPL(ring_buffer_size); static void rb_clear_buffer_page(struct buffer_page *page) { local_set(&page->write, 0); local_set(&page->entries, 0); rb_init_page(page->page); page->read = 0; } static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) { struct buffer_page *page; rb_head_page_deactivate(cpu_buffer); cpu_buffer->head_page = list_entry(cpu_buffer->pages, struct buffer_page, list); rb_clear_buffer_page(cpu_buffer->head_page); list_for_each_entry(page, cpu_buffer->pages, list) { rb_clear_buffer_page(page); } cpu_buffer->tail_page = cpu_buffer->head_page; cpu_buffer->commit_page = cpu_buffer->head_page; INIT_LIST_HEAD(&cpu_buffer->reader_page->list); INIT_LIST_HEAD(&cpu_buffer->new_pages); rb_clear_buffer_page(cpu_buffer->reader_page); local_set(&cpu_buffer->entries_bytes, 0); local_set(&cpu_buffer->overrun, 0); local_set(&cpu_buffer->commit_overrun, 0); local_set(&cpu_buffer->dropped_events, 0); local_set(&cpu_buffer->entries, 0); local_set(&cpu_buffer->committing, 0); local_set(&cpu_buffer->commits, 0); local_set(&cpu_buffer->pages_touched, 0); local_set(&cpu_buffer->pages_lost, 0); local_set(&cpu_buffer->pages_read, 0); cpu_buffer->last_pages_touch = 0; cpu_buffer->shortest_full = 0; cpu_buffer->read = 0; cpu_buffer->read_bytes = 0; rb_time_set(&cpu_buffer->write_stamp, 0); rb_time_set(&cpu_buffer->before_stamp, 0); memset(cpu_buffer->event_stamp, 0, sizeof(cpu_buffer->event_stamp)); cpu_buffer->lost_events = 0; cpu_buffer->last_overrun = 0; rb_head_page_activate(cpu_buffer); } /* Must have disabled the cpu buffer then done a synchronize_rcu */ static void reset_disabled_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) { unsigned long flags; raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) goto out; arch_spin_lock(&cpu_buffer->lock); rb_reset_cpu(cpu_buffer); arch_spin_unlock(&cpu_buffer->lock); out: raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } /** * ring_buffer_reset_cpu - reset a ring buffer per CPU buffer * @buffer: The ring buffer to reset a per cpu buffer of * @cpu: The CPU buffer to be reset */ void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return; /* prevent another thread from changing buffer sizes */ mutex_lock(&buffer->mutex); atomic_inc(&cpu_buffer->resize_disabled); atomic_inc(&cpu_buffer->record_disabled); /* Make sure all commits have finished */ synchronize_rcu(); reset_disabled_cpu_buffer(cpu_buffer); atomic_dec(&cpu_buffer->record_disabled); atomic_dec(&cpu_buffer->resize_disabled); mutex_unlock(&buffer->mutex); } EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); /* Flag to ensure proper resetting of atomic variables */ #define RESET_BIT (1 << 30) /** * ring_buffer_reset_online_cpus - reset a ring buffer per CPU buffer * @buffer: The ring buffer to reset a per cpu buffer of * @cpu: The CPU buffer to be reset */ void ring_buffer_reset_online_cpus(struct trace_buffer *buffer) { struct ring_buffer_per_cpu *cpu_buffer; int cpu; /* prevent another thread from changing buffer sizes */ mutex_lock(&buffer->mutex); for_each_online_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; atomic_add(RESET_BIT, &cpu_buffer->resize_disabled); atomic_inc(&cpu_buffer->record_disabled); } /* Make sure all commits have finished */ synchronize_rcu(); for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; /* * If a CPU came online during the synchronize_rcu(), then * ignore it. */ if (!(atomic_read(&cpu_buffer->resize_disabled) & RESET_BIT)) continue; reset_disabled_cpu_buffer(cpu_buffer); atomic_dec(&cpu_buffer->record_disabled); atomic_sub(RESET_BIT, &cpu_buffer->resize_disabled); } mutex_unlock(&buffer->mutex); } /** * ring_buffer_reset - reset a ring buffer * @buffer: The ring buffer to reset all cpu buffers */ void ring_buffer_reset(struct trace_buffer *buffer) { struct ring_buffer_per_cpu *cpu_buffer; int cpu; /* prevent another thread from changing buffer sizes */ mutex_lock(&buffer->mutex); for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; atomic_inc(&cpu_buffer->resize_disabled); atomic_inc(&cpu_buffer->record_disabled); } /* Make sure all commits have finished */ synchronize_rcu(); for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; reset_disabled_cpu_buffer(cpu_buffer); atomic_dec(&cpu_buffer->record_disabled); atomic_dec(&cpu_buffer->resize_disabled); } mutex_unlock(&buffer->mutex); } EXPORT_SYMBOL_GPL(ring_buffer_reset); /** * ring_buffer_empty - is the ring buffer empty? * @buffer: The ring buffer to test */ bool ring_buffer_empty(struct trace_buffer *buffer) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long flags; bool dolock; int cpu; int ret; /* yes this is racy, but if you don't like the race, lock the buffer */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; local_irq_save(flags); dolock = rb_reader_lock(cpu_buffer); ret = rb_per_cpu_empty(cpu_buffer); rb_reader_unlock(cpu_buffer, dolock); local_irq_restore(flags); if (!ret) return false; } return true; } EXPORT_SYMBOL_GPL(ring_buffer_empty); /** * ring_buffer_empty_cpu - is a cpu buffer of a ring buffer empty? * @buffer: The ring buffer * @cpu: The CPU buffer to test */ bool ring_buffer_empty_cpu(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long flags; bool dolock; int ret; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return true; cpu_buffer = buffer->buffers[cpu]; local_irq_save(flags); dolock = rb_reader_lock(cpu_buffer); ret = rb_per_cpu_empty(cpu_buffer); rb_reader_unlock(cpu_buffer, dolock); local_irq_restore(flags); return ret; } EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu); #ifdef CONFIG_RING_BUFFER_ALLOW_SWAP /** * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers * @buffer_a: One buffer to swap with * @buffer_b: The other buffer to swap with * @cpu: the CPU of the buffers to swap * * This function is useful for tracers that want to take a "snapshot" * of a CPU buffer and has another back up buffer lying around. * it is expected that the tracer handles the cpu buffer not being * used at the moment. */ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a, struct trace_buffer *buffer_b, int cpu) { struct ring_buffer_per_cpu *cpu_buffer_a; struct ring_buffer_per_cpu *cpu_buffer_b; int ret = -EINVAL; if (unlikely(has_ext_writer(buffer_a) || has_ext_writer(buffer_b))) return -EINVAL; if (!cpumask_test_cpu(cpu, buffer_a->cpumask) || !cpumask_test_cpu(cpu, buffer_b->cpumask)) goto out; cpu_buffer_a = buffer_a->buffers[cpu]; cpu_buffer_b = buffer_b->buffers[cpu]; /* At least make sure the two buffers are somewhat the same */ if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages) goto out; ret = -EAGAIN; if (atomic_read(&buffer_a->record_disabled)) goto out; if (atomic_read(&buffer_b->record_disabled)) goto out; if (atomic_read(&cpu_buffer_a->record_disabled)) goto out; if (atomic_read(&cpu_buffer_b->record_disabled)) goto out; /* * We can't do a synchronize_rcu here because this * function can be called in atomic context. * Normally this will be called from the same CPU as cpu. * If not it's up to the caller to protect this. */ atomic_inc(&cpu_buffer_a->record_disabled); atomic_inc(&cpu_buffer_b->record_disabled); ret = -EBUSY; if (local_read(&cpu_buffer_a->committing)) goto out_dec; if (local_read(&cpu_buffer_b->committing)) goto out_dec; /* * When resize is in progress, we cannot swap it because * it will mess the state of the cpu buffer. */ if (atomic_read(&buffer_a->resizing)) goto out_dec; if (atomic_read(&buffer_b->resizing)) goto out_dec; buffer_a->buffers[cpu] = cpu_buffer_b; buffer_b->buffers[cpu] = cpu_buffer_a; cpu_buffer_b->buffer = buffer_a; cpu_buffer_a->buffer = buffer_b; ret = 0; out_dec: atomic_dec(&cpu_buffer_a->record_disabled); atomic_dec(&cpu_buffer_b->record_disabled); out: return ret; } EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); #endif /* CONFIG_RING_BUFFER_ALLOW_SWAP */ /** * ring_buffer_alloc_read_page - allocate a page to read from buffer * @buffer: the buffer to allocate for. * @cpu: the cpu buffer to allocate. * * This function is used in conjunction with ring_buffer_read_page. * When reading a full page from the ring buffer, these functions * can be used to speed up the process. The calling function should * allocate a few pages first with this function. Then when it * needs to get pages from the ring buffer, it passes the result * of this function into ring_buffer_read_page, which will swap * the page that was allocated, with the read page of the buffer. * * Returns: * The page allocated, or ERR_PTR */ void *ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; struct buffer_data_page *bpage = NULL; unsigned long flags; struct page *page; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return ERR_PTR(-ENODEV); cpu_buffer = buffer->buffers[cpu]; local_irq_save(flags); arch_spin_lock(&cpu_buffer->lock); if (cpu_buffer->free_page) { bpage = cpu_buffer->free_page; cpu_buffer->free_page = NULL; } arch_spin_unlock(&cpu_buffer->lock); local_irq_restore(flags); if (bpage) goto out; page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_NORETRY, 0); if (!page) return ERR_PTR(-ENOMEM); bpage = page_address(page); out: rb_init_page(bpage); return bpage; } EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page); /** * ring_buffer_free_read_page - free an allocated read page * @buffer: the buffer the page was allocate for * @cpu: the cpu buffer the page came from * @data: the page to free * * Free a page allocated from ring_buffer_alloc_read_page. */ void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data) { struct ring_buffer_per_cpu *cpu_buffer; struct buffer_data_page *bpage = data; struct page *page = virt_to_page(bpage); unsigned long flags; if (!buffer || !buffer->buffers || !buffer->buffers[cpu]) return; cpu_buffer = buffer->buffers[cpu]; /* If the page is still in use someplace else, we can't reuse it */ if (page_ref_count(page) > 1) goto out; local_irq_save(flags); arch_spin_lock(&cpu_buffer->lock); if (!cpu_buffer->free_page) { cpu_buffer->free_page = bpage; bpage = NULL; } arch_spin_unlock(&cpu_buffer->lock); local_irq_restore(flags); out: free_page((unsigned long)bpage); } EXPORT_SYMBOL_GPL(ring_buffer_free_read_page); /** * ring_buffer_read_page - extract a page from the ring buffer * @buffer: buffer to extract from * @data_page: the page to use allocated from ring_buffer_alloc_read_page * @len: amount to extract * @cpu: the cpu of the buffer to extract * @full: should the extraction only happen when the page is full. * * This function will pull out a page from the ring buffer and consume it. * @data_page must be the address of the variable that was returned * from ring_buffer_alloc_read_page. This is because the page might be used * to swap with a page in the ring buffer. * * for example: * rpage = ring_buffer_alloc_read_page(buffer, cpu); * if (IS_ERR(rpage)) * return PTR_ERR(rpage); * ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0); * if (ret >= 0) * process_page(rpage, ret); * * When @full is set, the function will not return true unless * the writer is off the reader page. * * Note: it is up to the calling functions to handle sleeps and wakeups. * The ring buffer can be used anywhere in the kernel and can not * blindly call wake_up. The layer that uses the ring buffer must be * responsible for that. * * Returns: * >=0 if data has been transferred, returns the offset of consumed data. * <0 if no data has been transferred. */ int ring_buffer_read_page(struct trace_buffer *buffer, void **data_page, size_t len, int cpu, int full) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; struct ring_buffer_event *event; struct buffer_data_page *bpage; struct buffer_page *reader; unsigned long missed_events; unsigned long flags; unsigned int commit; unsigned int read; u64 save_timestamp; int ret = -1; if (!cpumask_test_cpu(cpu, buffer->cpumask)) goto out; /* * If len is not big enough to hold the page header, then * we can not copy anything. */ if (len <= BUF_PAGE_HDR_SIZE) goto out; len -= BUF_PAGE_HDR_SIZE; if (!data_page) goto out; bpage = *data_page; if (!bpage) goto out; raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); reader = rb_get_reader_page(cpu_buffer); if (!reader) goto out_unlock; event = rb_reader_event(cpu_buffer); read = reader->read; commit = rb_page_commit(reader); /* Check if any events were dropped */ missed_events = cpu_buffer->lost_events; /* * If this page has been partially read or * if len is not big enough to read the rest of the page or * a writer is still on the page, then * we must copy the data from the page to the buffer. * Otherwise, we can simply swap the page with the one passed in. */ if (read || (len < (commit - read)) || cpu_buffer->reader_page == cpu_buffer->commit_page || unlikely(has_ext_writer(buffer))) { struct buffer_data_page *rpage = cpu_buffer->reader_page->page; unsigned int rpos = read; unsigned int pos = 0; unsigned int size; /* * If a full page is expected, this can still be returned * if there's been a previous partial read and the * rest of the page can be read and the commit page is off * the reader page. */ if (full && (!read || (len < (commit - read)) || cpu_buffer->reader_page == cpu_buffer->commit_page)) goto out_unlock; if (len > (commit - read)) len = (commit - read); /* Always keep the time extend and data together */ size = rb_event_ts_length(event); if (len < size) goto out_unlock; /* save the current timestamp, since the user will need it */ save_timestamp = cpu_buffer->read_stamp; /* Need to copy one event at a time */ do { /* We need the size of one event, because * rb_advance_reader only advances by one event, * whereas rb_event_ts_length may include the size of * one or two events. * We have already ensured there's enough space if this * is a time extend. */ size = rb_event_length(event); memcpy(bpage->data + pos, rpage->data + rpos, size); len -= size; rb_advance_reader(cpu_buffer); rpos = reader->read; pos += size; if (rpos >= commit) break; event = rb_reader_event(cpu_buffer); /* Always keep the time extend and data together */ size = rb_event_ts_length(event); } while (len >= size); /* update bpage */ local_set(&bpage->commit, pos); bpage->time_stamp = save_timestamp; /* we copied everything to the beginning */ read = 0; } else { /* update the entry counter */ cpu_buffer->read += rb_page_entries(reader); cpu_buffer->read_bytes += rb_page_commit(reader); /* swap the pages */ rb_init_page(bpage); bpage = reader->page; reader->page = *data_page; local_set(&reader->write, 0); local_set(&reader->entries, 0); reader->read = 0; *data_page = bpage; /* * Use the real_end for the data size, * This gives us a chance to store the lost events * on the page. */ if (reader->real_end) local_set(&bpage->commit, reader->real_end); } ret = read; cpu_buffer->lost_events = 0; commit = local_read(&bpage->commit); /* * Set a flag in the commit field if we lost events */ if (missed_events) { /* If there is room at the end of the page to save the * missed events, then record it there. */ if (BUF_PAGE_SIZE - commit >= sizeof(missed_events)) { memcpy(&bpage->data[commit], &missed_events, sizeof(missed_events)); local_add(RB_MISSED_STORED, &bpage->commit); commit += sizeof(missed_events); } local_add(RB_MISSED_EVENTS, &bpage->commit); } /* * This page may be off to user land. Zero it out here. */ if (commit < BUF_PAGE_SIZE) memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit); out_unlock: raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); out: return ret; } EXPORT_SYMBOL_GPL(ring_buffer_read_page); /* * We only allocate new buffers, never free them if the CPU goes down. * If we were to free the buffer, then the user would lose any trace that was in * the buffer. */ int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node) { struct trace_buffer *buffer; long nr_pages_same; int cpu_i; unsigned long nr_pages; buffer = container_of(node, struct trace_buffer, node); if (cpumask_test_cpu(cpu, buffer->cpumask)) return 0; nr_pages = 0; nr_pages_same = 1; /* check if all cpu sizes are same */ for_each_buffer_cpu(buffer, cpu_i) { /* fill in the size from first enabled cpu */ if (nr_pages == 0) nr_pages = buffer->buffers[cpu_i]->nr_pages; if (nr_pages != buffer->buffers[cpu_i]->nr_pages) { nr_pages_same = 0; break; } } /* allocate minimum pages, user can later expand it */ if (!nr_pages_same) nr_pages = 2; buffer->buffers[cpu] = rb_allocate_cpu_buffer(buffer, nr_pages, cpu); if (!buffer->buffers[cpu]) { WARN(1, "failed to allocate ring buffer on CPU %u\n", cpu); return -ENOMEM; } smp_wmb(); cpumask_set_cpu(cpu, buffer->cpumask); return 0; } #define TRACE_BUFFER_PACK_HDR_SIZE offsetof(struct trace_buffer_pack, __data) #define RING_BUFFER_PACK_HDR_SIZE offsetof(struct ring_buffer_pack, page_va) size_t trace_buffer_pack_size(struct trace_buffer *trace_buffer) { size_t size = 0; int cpu; for_each_buffer_cpu(trace_buffer, cpu) { struct ring_buffer_per_cpu *rb = trace_buffer->buffers[cpu]; size += rb->nr_pages * sizeof(unsigned long); size += RING_BUFFER_PACK_HDR_SIZE; } size += TRACE_BUFFER_PACK_HDR_SIZE; return size; } int trace_buffer_pack(struct trace_buffer *trace_buffer, struct trace_buffer_pack *pack) { struct ring_buffer_pack *cpu_pack; int cpu = -1, pack_cpu, j; if (!has_ext_writer(trace_buffer)) return -EINVAL; pack->nr_cpus = cpumask_weight(trace_buffer->cpumask); pack->total_pages = 0; for_each_ring_buffer_pack(cpu_pack, pack_cpu, pack) { struct ring_buffer_per_cpu *rb; unsigned long flags, nr_pages; struct buffer_page *bpage; cpu = cpumask_next(cpu, trace_buffer->cpumask); if (cpu > nr_cpu_ids) { WARN_ON(1); break; } rb = trace_buffer->buffers[cpu]; local_irq_save(flags); arch_spin_lock(&rb->lock); bpage = rb->head_page; nr_pages = rb->nr_pages; pack->total_pages += nr_pages + 1; cpu_pack->cpu = cpu; cpu_pack->reader_page_va = (unsigned long)rb->reader_page->page; cpu_pack->nr_pages = nr_pages; for (j = 0; j < nr_pages; j++) { cpu_pack->page_va[j] = (unsigned long)bpage->page; rb_inc_page(&bpage); } arch_spin_unlock(&rb->lock); local_irq_restore(flags); } return 0; } #ifdef CONFIG_RING_BUFFER_STARTUP_TEST /* * This is a basic integrity check of the ring buffer. * Late in the boot cycle this test will run when configured in. * It will kick off a thread per CPU that will go into a loop * writing to the per cpu ring buffer various sizes of data. * Some of the data will be large items, some small. * * Another thread is created that goes into a spin, sending out * IPIs to the other CPUs to also write into the ring buffer. * this is to test the nesting ability of the buffer. * * Basic stats are recorded and reported. If something in the * ring buffer should happen that's not expected, a big warning * is displayed and all ring buffers are disabled. */ static struct task_struct *rb_threads[NR_CPUS] __initdata; struct rb_test_data { struct trace_buffer *buffer; unsigned long events; unsigned long bytes_written; unsigned long bytes_alloc; unsigned long bytes_dropped; unsigned long events_nested; unsigned long bytes_written_nested; unsigned long bytes_alloc_nested; unsigned long bytes_dropped_nested; int min_size_nested; int max_size_nested; int max_size; int min_size; int cpu; int cnt; }; static struct rb_test_data rb_data[NR_CPUS] __initdata; /* 1 meg per cpu */ #define RB_TEST_BUFFER_SIZE 1048576 static char rb_string[] __initdata = "abcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()?+\\" "?+|:';\",.<>/?abcdefghijklmnopqrstuvwxyz1234567890" "!@#$%^&*()?+\\?+|:';\",.<>/?abcdefghijklmnopqrstuv"; static bool rb_test_started __initdata; struct rb_item { int size; char str[]; }; static __init int rb_write_something(struct rb_test_data *data, bool nested) { struct ring_buffer_event *event; struct rb_item *item; bool started; int event_len; int size; int len; int cnt; /* Have nested writes different that what is written */ cnt = data->cnt + (nested ? 27 : 0); /* Multiply cnt by ~e, to make some unique increment */ size = (cnt * 68 / 25) % (sizeof(rb_string) - 1); len = size + sizeof(struct rb_item); started = rb_test_started; /* read rb_test_started before checking buffer enabled */ smp_rmb(); event = ring_buffer_lock_reserve(data->buffer, len); if (!event) { /* Ignore dropped events before test starts. */ if (started) { if (nested) data->bytes_dropped += len; else data->bytes_dropped_nested += len; } return len; } event_len = ring_buffer_event_length(event); if (RB_WARN_ON(data->buffer, event_len < len)) goto out; item = ring_buffer_event_data(event); item->size = size; memcpy(item->str, rb_string, size); if (nested) { data->bytes_alloc_nested += event_len; data->bytes_written_nested += len; data->events_nested++; if (!data->min_size_nested || len < data->min_size_nested) data->min_size_nested = len; if (len > data->max_size_nested) data->max_size_nested = len; } else { data->bytes_alloc += event_len; data->bytes_written += len; data->events++; if (!data->min_size || len < data->min_size) data->max_size = len; if (len > data->max_size) data->max_size = len; } out: ring_buffer_unlock_commit(data->buffer, event); return 0; } static __init int rb_test(void *arg) { struct rb_test_data *data = arg; while (!kthread_should_stop()) { rb_write_something(data, false); data->cnt++; set_current_state(TASK_INTERRUPTIBLE); /* Now sleep between a min of 100-300us and a max of 1ms */ usleep_range(((data->cnt % 3) + 1) * 100, 1000); } return 0; } static __init void rb_ipi(void *ignore) { struct rb_test_data *data; int cpu = smp_processor_id(); data = &rb_data[cpu]; rb_write_something(data, true); } static __init int rb_hammer_test(void *arg) { while (!kthread_should_stop()) { /* Send an IPI to all cpus to write data! */ smp_call_function(rb_ipi, NULL, 1); /* No sleep, but for non preempt, let others run */ schedule(); } return 0; } static __init int test_ringbuffer(void) { struct task_struct *rb_hammer; struct trace_buffer *buffer; int cpu; int ret = 0; if (security_locked_down(LOCKDOWN_TRACEFS)) { pr_warn("Lockdown is enabled, skipping ring buffer tests\n"); return 0; } pr_info("Running ring buffer tests...\n"); buffer = ring_buffer_alloc(RB_TEST_BUFFER_SIZE, RB_FL_OVERWRITE); if (WARN_ON(!buffer)) return 0; /* Disable buffer so that threads can't write to it yet */ ring_buffer_record_off(buffer); for_each_online_cpu(cpu) { rb_data[cpu].buffer = buffer; rb_data[cpu].cpu = cpu; rb_data[cpu].cnt = cpu; rb_threads[cpu] = kthread_run_on_cpu(rb_test, &rb_data[cpu], cpu, "rbtester/%u"); if (WARN_ON(IS_ERR(rb_threads[cpu]))) { pr_cont("FAILED\n"); ret = PTR_ERR(rb_threads[cpu]); goto out_free; } } /* Now create the rb hammer! */ rb_hammer = kthread_run(rb_hammer_test, NULL, "rbhammer"); if (WARN_ON(IS_ERR(rb_hammer))) { pr_cont("FAILED\n"); ret = PTR_ERR(rb_hammer); goto out_free; } ring_buffer_record_on(buffer); /* * Show buffer is enabled before setting rb_test_started. * Yes there's a small race window where events could be * dropped and the thread wont catch it. But when a ring * buffer gets enabled, there will always be some kind of * delay before other CPUs see it. Thus, we don't care about * those dropped events. We care about events dropped after * the threads see that the buffer is active. */ smp_wmb(); rb_test_started = true; set_current_state(TASK_INTERRUPTIBLE); /* Just run for 10 seconds */; schedule_timeout(10 * HZ); kthread_stop(rb_hammer); out_free: for_each_online_cpu(cpu) { if (!rb_threads[cpu]) break; kthread_stop(rb_threads[cpu]); } if (ret) { ring_buffer_free(buffer); return ret; } /* Report! */ pr_info("finished\n"); for_each_online_cpu(cpu) { struct ring_buffer_event *event; struct rb_test_data *data = &rb_data[cpu]; struct rb_item *item; unsigned long total_events; unsigned long total_dropped; unsigned long total_written; unsigned long total_alloc; unsigned long total_read = 0; unsigned long total_size = 0; unsigned long total_len = 0; unsigned long total_lost = 0; unsigned long lost; int big_event_size; int small_event_size; ret = -1; total_events = data->events + data->events_nested; total_written = data->bytes_written + data->bytes_written_nested; total_alloc = data->bytes_alloc + data->bytes_alloc_nested; total_dropped = data->bytes_dropped + data->bytes_dropped_nested; big_event_size = data->max_size + data->max_size_nested; small_event_size = data->min_size + data->min_size_nested; pr_info("CPU %d:\n", cpu); pr_info(" events: %ld\n", total_events); pr_info(" dropped bytes: %ld\n", total_dropped); pr_info(" alloced bytes: %ld\n", total_alloc); pr_info(" written bytes: %ld\n", total_written); pr_info(" biggest event: %d\n", big_event_size); pr_info(" smallest event: %d\n", small_event_size); if (RB_WARN_ON(buffer, total_dropped)) break; ret = 0; while ((event = ring_buffer_consume(buffer, cpu, NULL, &lost))) { total_lost += lost; item = ring_buffer_event_data(event); total_len += ring_buffer_event_length(event); total_size += item->size + sizeof(struct rb_item); if (memcmp(&item->str[0], rb_string, item->size) != 0) { pr_info("FAILED!\n"); pr_info("buffer had: %.*s\n", item->size, item->str); pr_info("expected: %.*s\n", item->size, rb_string); RB_WARN_ON(buffer, 1); ret = -1; break; } total_read++; } if (ret) break; ret = -1; pr_info(" read events: %ld\n", total_read); pr_info(" lost events: %ld\n", total_lost); pr_info(" total events: %ld\n", total_lost + total_read); pr_info(" recorded len bytes: %ld\n", total_len); pr_info(" recorded size bytes: %ld\n", total_size); if (total_lost) { pr_info(" With dropped events, record len and size may not match\n" " alloced and written from above\n"); } else { if (RB_WARN_ON(buffer, total_len != total_alloc || total_size != total_written)) break; } if (RB_WARN_ON(buffer, total_lost + total_read != total_events)) break; ret = 0; } if (!ret) pr_info("Ring buffer PASSED!\n"); ring_buffer_free(buffer); return 0; } late_initcall(test_ringbuffer); #endif /* CONFIG_RING_BUFFER_STARTUP_TEST */ |
5 3 8 8 3 4 1 8 8 8 8 8 18 2 18 8 8 6 2 6 1 20 2 18 2 1 1 8 8 8 1 8 7 1 8 8 5 3 8 8 8 17 14 2 1 12 13 10 14 13 15 10 44 44 44 14 36 44 19 27 3 1 27 5 23 28 7 21 3 8 1 8 8 8 8 5 15 36 8 8 8 8 37 34 12 39 40 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 | /* * cdc_ncm.c * * Copyright (C) ST-Ericsson 2010-2012 * Contact: Alexey Orishko <alexey.orishko@stericsson.com> * Original author: Hans Petter Selasky <hans.petter.selasky@stericsson.com> * * USB Host Driver for Network Control Model (NCM) * http://www.usb.org/developers/docs/devclass_docs/NCM10_012011.zip * * The NCM encoding, decoding and initialization logic * derives from FreeBSD 8.x. if_cdce.c and if_cdcereg.h * * This software is available to you under a choice of one of two * licenses. You may choose this file to be licensed under the terms * of the GNU General Public License (GPL) Version 2 or the 2-clause * BSD license listed below: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include <linux/module.h> #include <linux/netdevice.h> #include <linux/ctype.h> #include <linux/etherdevice.h> #include <linux/ethtool.h> #include <linux/workqueue.h> #include <linux/mii.h> #include <linux/crc32.h> #include <linux/usb.h> #include <linux/hrtimer.h> #include <linux/atomic.h> #include <linux/usb/usbnet.h> #include <linux/usb/cdc.h> #include <linux/usb/cdc_ncm.h> #if IS_ENABLED(CONFIG_USB_NET_CDC_MBIM) static bool prefer_mbim = true; #else static bool prefer_mbim; #endif module_param(prefer_mbim, bool, 0644); MODULE_PARM_DESC(prefer_mbim, "Prefer MBIM setting on dual NCM/MBIM functions"); static void cdc_ncm_txpath_bh(struct tasklet_struct *t); static void cdc_ncm_tx_timeout_start(struct cdc_ncm_ctx *ctx); static enum hrtimer_restart cdc_ncm_tx_timer_cb(struct hrtimer *hr_timer); static struct usb_driver cdc_ncm_driver; struct cdc_ncm_stats { char stat_string[ETH_GSTRING_LEN]; int sizeof_stat; int stat_offset; }; #define CDC_NCM_STAT(str, m) { \ .stat_string = str, \ .sizeof_stat = sizeof(((struct cdc_ncm_ctx *)0)->m), \ .stat_offset = offsetof(struct cdc_ncm_ctx, m) } #define CDC_NCM_SIMPLE_STAT(m) CDC_NCM_STAT(__stringify(m), m) static const struct cdc_ncm_stats cdc_ncm_gstrings_stats[] = { CDC_NCM_SIMPLE_STAT(tx_reason_ntb_full), CDC_NCM_SIMPLE_STAT(tx_reason_ndp_full), CDC_NCM_SIMPLE_STAT(tx_reason_timeout), CDC_NCM_SIMPLE_STAT(tx_reason_max_datagram), CDC_NCM_SIMPLE_STAT(tx_overhead), CDC_NCM_SIMPLE_STAT(tx_ntbs), CDC_NCM_SIMPLE_STAT(rx_overhead), CDC_NCM_SIMPLE_STAT(rx_ntbs), }; #define CDC_NCM_LOW_MEM_MAX_CNT 10 static int cdc_ncm_get_sset_count(struct net_device __always_unused *netdev, int sset) { switch (sset) { case ETH_SS_STATS: return ARRAY_SIZE(cdc_ncm_gstrings_stats); default: return -EOPNOTSUPP; } } static void cdc_ncm_get_ethtool_stats(struct net_device *netdev, struct ethtool_stats __always_unused *stats, u64 *data) { struct usbnet *dev = netdev_priv(netdev); struct cdc_ncm_ctx *ctx = (struct cdc_ncm_ctx *)dev->data[0]; int i; char *p = NULL; for (i = 0; i < ARRAY_SIZE(cdc_ncm_gstrings_stats); i++) { p = (char *)ctx + cdc_ncm_gstrings_stats[i].stat_offset; data[i] = (cdc_ncm_gstrings_stats[i].sizeof_stat == sizeof(u64)) ? *(u64 *)p : *(u32 *)p; } } static void cdc_ncm_get_strings(struct net_device __always_unused *netdev, u32 stringset, u8 *data) { u8 *p = data; int i; switch (stringset) { case ETH_SS_STATS: for (i = 0; i < ARRAY_SIZE(cdc_ncm_gstrings_stats); i++) { memcpy(p, cdc_ncm_gstrings_stats[i].stat_string, ETH_GSTRING_LEN); p += ETH_GSTRING_LEN; } } } static void cdc_ncm_update_rxtx_max(struct usbnet *dev, u32 new_rx, u32 new_tx); static const struct ethtool_ops cdc_ncm_ethtool_ops = { .get_link = usbnet_get_link, .nway_reset = usbnet_nway_reset, .get_drvinfo = usbnet_get_drvinfo, .get_msglevel = usbnet_get_msglevel, .set_msglevel = usbnet_set_msglevel, .get_ts_info = ethtool_op_get_ts_info, .get_sset_count = cdc_ncm_get_sset_count, .get_strings = cdc_ncm_get_strings, .get_ethtool_stats = cdc_ncm_get_ethtool_stats, .get_link_ksettings = usbnet_get_link_ksettings_internal, .set_link_ksettings = NULL, }; static u32 cdc_ncm_check_rx_max(struct usbnet *dev, u32 new_rx) { struct cdc_ncm_ctx *ctx = (struct cdc_ncm_ctx *)dev->data[0]; u32 val, max, min; /* clamp new_rx to sane values */ min = USB_CDC_NCM_NTB_MIN_IN_SIZE; max = min_t(u32, CDC_NCM_NTB_MAX_SIZE_RX, le32_to_cpu(ctx->ncm_parm.dwNtbInMaxSize)); /* dwNtbInMaxSize spec violation? Use MIN size for both limits */ if (max < min) { dev_warn(&dev->intf->dev, "dwNtbInMaxSize=%u is too small. Using %u\n", le32_to_cpu(ctx->ncm_parm.dwNtbInMaxSize), min); max = min; } val = clamp_t(u32, new_rx, min, max); if (val != new_rx) dev_dbg(&dev->intf->dev, "rx_max must be in the [%u, %u] range\n", min, max); return val; } static u32 cdc_ncm_check_tx_max(struct usbnet *dev, u32 new_tx) { struct cdc_ncm_ctx *ctx = (struct cdc_ncm_ctx *)dev->data[0]; u32 val, max, min; /* clamp new_tx to sane values */ if (ctx->is_ndp16) min = ctx->max_datagram_size + ctx->max_ndp_size + sizeof(struct usb_cdc_ncm_nth16); else min = ctx->max_datagram_size + ctx->max_ndp_size + sizeof(struct usb_cdc_ncm_nth32); if (le32_to_cpu(ctx->ncm_parm.dwNtbOutMaxSize) == 0) max = CDC_NCM_NTB_MAX_SIZE_TX; /* dwNtbOutMaxSize not set */ else max = clamp_t(u32, le32_to_cpu(ctx->ncm_parm.dwNtbOutMaxSize), USB_CDC_NCM_NTB_MIN_OUT_SIZE, CDC_NCM_NTB_MAX_SIZE_TX); /* some devices set dwNtbOutMaxSize too low for the above default */ min = min(min, max); val = clamp_t(u32, new_tx, min, max); if (val != new_tx) dev_dbg(&dev->intf->dev, "tx_max must be in the [%u, %u] range\n", min, max); return val; } static ssize_t min_tx_pkt_show(struct device *d, struct device_attribute *attr, char *buf) { struct usbnet *dev = netdev_priv(to_net_dev(d)); struct cdc_ncm_ctx *ctx = (struct cdc_ncm_ctx *)dev->data[0]; return sprintf(buf, "%u\n", ctx->min_tx_pkt); } static ssize_t rx_max_show(struct device *d, struct device_attribute *attr, char *buf) { struct usbnet *dev = netdev_priv(to_net_dev(d)); struct cdc_ncm_ctx *ctx = (struct cdc_ncm_ctx *)dev->data[0]; return sprintf(buf, "%u\n", ctx->rx_max); } static ssize_t tx_max_show(struct device *d, struct device_attribute *attr, char *buf) { struct usbnet *dev = netdev_priv(to_net_dev(d)); struct cdc_ncm_ctx *ctx = (struct cdc_ncm_ctx *)dev->data[0]; return sprintf(buf, "%u\n", ctx->tx_max); } static ssize_t tx_timer_usecs_show(struct device *d, struct device_attribute *attr, char *buf) { struct usbnet *dev = netdev_priv(to_net_dev(d)); struct cdc_ncm_ctx *ctx = (struct cdc_ncm_ctx *)dev->data[0]; return sprintf(buf, "%u\n", ctx->timer_interval / (u32)NSEC_PER_USEC); } static ssize_t min_tx_pkt_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { struct usbnet *dev = netdev_priv(to_net_dev(d)); struct cdc_ncm_ctx *ctx = (struct cdc_ncm_ctx *)dev->data[0]; unsigned long val; /* no need to restrict values - anything from 0 to infinity is OK */ if (kstrtoul(buf, 0, &val)) return -EINVAL; ctx->min_tx_pkt = val; return len; } static ssize_t rx_max_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { struct usbnet *dev = netdev_priv(to_net_dev(d)); struct cdc_ncm_ctx *ctx = (struct cdc_ncm_ctx *)dev->data[0]; unsigned long val; if (kstrtoul(buf, 0, &val) || cdc_ncm_check_rx_max(dev, val) != val) return -EINVAL; cdc_ncm_update_rxtx_max(dev, val, ctx->tx_max); return len; } static ssize_t tx_max_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { struct usbnet *dev = netdev_priv(to_net_dev(d)); struct cdc_ncm_ctx *ctx = (struct cdc_ncm_ctx *)dev->data[0]; unsigned long val; if (kstrtoul(buf, 0, &val) || cdc_ncm_check_tx_max(dev, val) != val) return -EINVAL; cdc_ncm_update_rxtx_max(dev, ctx->rx_max, val); return len; } static ssize_t tx_timer_usecs_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { struct usbnet *dev = netdev_priv(to_net_dev(d)); struct cdc_ncm_ctx *ctx = (struct cdc_ncm_ctx *)dev->data[0]; ssize_t ret; unsigned long val; ret = kstrtoul(buf, 0, &val); if (ret) return ret; if (val && (val < CDC_NCM_TIMER_INTERVAL_MIN || val > CDC_NCM_TIMER_INTERVAL_MAX)) return -EINVAL; spin_lock_bh(&ctx->mtx); ctx->timer_interval = val * NSEC_PER_USEC; if (!ctx->timer_interval) ctx->tx_timer_pending = 0; spin_unlock_bh(&ctx->mtx); return len; } static DEVICE_ATTR_RW(min_tx_pkt); static DEVICE_ATTR_RW(rx_max); static DEVICE_ATTR_RW(tx_max); static DEVICE_ATTR_RW(tx_timer_usecs); static ssize_t ndp_to_end_show(struct device *d, struct device_attribute *attr, char *buf) { struct usbnet *dev = netdev_priv(to_net_dev(d)); struct cdc_ncm_ctx *ctx = (struct cdc_ncm_ctx *)dev->data[0]; return sprintf(buf, "%c\n", ctx->drvflags & CDC_NCM_FLAG_NDP_TO_END ? 'Y' : 'N'); } static ssize_t ndp_to_end_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { struct usbnet *dev = netdev_priv(to_net_dev(d)); struct cdc_ncm_ctx *ctx = (struct cdc_ncm_ctx *)dev->data[0]; bool enable; if (strtobool(buf, &enable)) return -EINVAL; /* no change? */ if (enable == (ctx->drvflags & CDC_NCM_FLAG_NDP_TO_END)) return len; if (enable) { if (ctx->is_ndp16 && !ctx->delayed_ndp16) { ctx->delayed_ndp16 = kzalloc(ctx->max_ndp_size, GFP_KERNEL); if (!ctx->delayed_ndp16) return -ENOMEM; } if (!ctx->is_ndp16 && !ctx->delayed_ndp32) { ctx->delayed_ndp32 = kzalloc(ctx->max_ndp_size, GFP_KERNEL); if (!ctx->delayed_ndp32) return -ENOMEM; } } /* flush pending data before changing flag */ netif_tx_lock_bh(dev->net); usbnet_start_xmit(NULL, dev->net); spin_lock_bh(&ctx->mtx); if (enable) ctx->drvflags |= CDC_NCM_FLAG_NDP_TO_END; else ctx->drvflags &= ~CDC_NCM_FLAG_NDP_TO_END; spin_unlock_bh(&ctx->mtx); netif_tx_unlock_bh(dev->net); return len; } static DEVICE_ATTR_RW(ndp_to_end); #define NCM_PARM_ATTR(name, format, tocpu) \ static ssize_t cdc_ncm_show_##name(struct device *d, struct device_attribute *attr, char *buf) \ { \ struct usbnet *dev = netdev_priv(to_net_dev(d)); \ struct cdc_ncm_ctx *ctx = (struct cdc_ncm_ctx *)dev->data[0]; \ return sprintf(buf, format "\n", tocpu(ctx->ncm_parm.name)); \ } \ static DEVICE_ATTR(name, 0444, cdc_ncm_show_##name, NULL) NCM_PARM_ATTR(bmNtbFormatsSupported, "0x%04x", le16_to_cpu); NCM_PARM_ATTR(dwNtbInMaxSize, "%u", le32_to_cpu); NCM_PARM_ATTR(wNdpInDivisor, "%u", le16_to_cpu); NCM_PARM_ATTR(wNdpInPayloadRemainder, "%u", le16_to_cpu); NCM_PARM_ATTR(wNdpInAlignment, "%u", le16_to_cpu); NCM_PARM_ATTR(dwNtbOutMaxSize, "%u", le32_to_cpu); NCM_PARM_ATTR(wNdpOutDivisor, "%u", le16_to_cpu); NCM_PARM_ATTR(wNdpOutPayloadRemainder, "%u", le16_to_cpu); NCM_PARM_ATTR(wNdpOutAlignment, "%u", le16_to_cpu); NCM_PARM_ATTR(wNtbOutMaxDatagrams, "%u", le16_to_cpu); static struct attribute *cdc_ncm_sysfs_attrs[] = { &dev_attr_min_tx_pkt.attr, &dev_attr_ndp_to_end.attr, &dev_attr_rx_max.attr, &dev_attr_tx_max.attr, &dev_attr_tx_timer_usecs.attr, &dev_attr_bmNtbFormatsSupported.attr, &dev_attr_dwNtbInMaxSize.attr, &dev_attr_wNdpInDivisor.attr, &dev_attr_wNdpInPayloadRemainder.attr, &dev_attr_wNdpInAlignment.attr, &dev_attr_dwNtbOutMaxSize.attr, &dev_attr_wNdpOutDivisor.attr, &dev_attr_wNdpOutPayloadRemainder.attr, &dev_attr_wNdpOutAlignment.attr, &dev_attr_wNtbOutMaxDatagrams.attr, NULL, }; static const struct attribute_group cdc_ncm_sysfs_attr_group = { .name = "cdc_ncm", .attrs = cdc_ncm_sysfs_attrs, }; /* handle rx_max and tx_max changes */ static void cdc_ncm_update_rxtx_max(struct usbnet *dev, u32 new_rx, u32 new_tx) { struct cdc_ncm_ctx *ctx = (struct cdc_ncm_ctx *)dev->data[0]; u8 iface_no = ctx->control->cur_altsetting->desc.bInterfaceNumber; u32 val; val = cdc_ncm_check_rx_max(dev, new_rx); /* inform device about NTB input size changes */ if (val != ctx->rx_max) { __le32 dwNtbInMaxSize = cpu_to_le32(val); dev_info(&dev->intf->dev, "setting rx_max = %u\n", val); /* tell device to use new size */ if (usbnet_write_cmd(dev, USB_CDC_SET_NTB_INPUT_SIZE, USB_TYPE_CLASS | USB_DIR_OUT | USB_RECIP_INTERFACE, 0, iface_no, &dwNtbInMaxSize, 4) < 0) dev_dbg(&dev->intf->dev, "Setting NTB Input Size failed\n"); else ctx->rx_max = val; } /* usbnet use these values for sizing rx queues */ if (dev->rx_urb_size != ctx->rx_max) { dev->rx_urb_size = ctx->rx_max; if (netif_running(dev->net)) usbnet_unlink_rx_urbs(dev); } val = cdc_ncm_check_tx_max(dev, new_tx); if (val != ctx->tx_max) dev_info(&dev->intf->dev, "setting tx_max = %u\n", val); /* Adding a pad byte here if necessary simplifies the handling * in cdc_ncm_fill_tx_frame, making tx_max always represent * the real skb max size. * * We cannot use dev->maxpacket here because this is called from * .bind which is called before usbnet sets up dev->maxpacket */ if (val != le32_to_cpu(ctx->ncm_parm.dwNtbOutMaxSize) && val % usb_maxpacket(dev->udev, dev->out) == 0) val++; /* we might need to flush any pending tx buffers if running */ if (netif_running(dev->net) && val > ctx->tx_max) { netif_tx_lock_bh(dev->net); usbnet_start_xmit(NULL, dev->net); /* make sure tx_curr_skb is reallocated if it was empty */ if (ctx->tx_curr_skb) { dev_kfree_skb_any(ctx->tx_curr_skb); ctx->tx_curr_skb = NULL; } ctx->tx_max = val; netif_tx_unlock_bh(dev->net); } else { ctx->tx_max = val; } dev->hard_mtu = ctx->tx_max; /* max qlen depend on hard_mtu and rx_urb_size */ usbnet_update_max_qlen(dev); /* never pad more than 3 full USB packets per transfer */ ctx->min_tx_pkt = clamp_t(u16, ctx->tx_max - 3 * usb_maxpacket(dev->udev, dev->out), CDC_NCM_MIN_TX_PKT, ctx->tx_max); } /* helpers for NCM and MBIM differences */ static u8 cdc_ncm_flags(struct usbnet *dev) { struct cdc_ncm_ctx *ctx = (struct cdc_ncm_ctx *)dev->data[0]; if (cdc_ncm_comm_intf_is_mbim(dev->intf->cur_altsetting) && ctx->mbim_desc) return ctx->mbim_desc->bmNetworkCapabilities; if (ctx->func_desc) return ctx->func_desc->bmNetworkCapabilities; return 0; } static int cdc_ncm_eth_hlen(struct usbnet *dev) { if (cdc_ncm_comm_intf_is_mbim(dev->intf->cur_altsetting)) return 0; return ETH_HLEN; } static u32 cdc_ncm_min_dgram_size(struct usbnet *dev) { if (cdc_ncm_comm_intf_is_mbim(dev->intf->cur_altsetting)) return CDC_MBIM_MIN_DATAGRAM_SIZE; return CDC_NCM_MIN_DATAGRAM_SIZE; } static u32 cdc_ncm_max_dgram_size(struct usbnet *dev) { struct cdc_ncm_ctx *ctx = (struct cdc_ncm_ctx *)dev->data[0]; if (cdc_ncm_comm_intf_is_mbim(dev->intf->cur_altsetting) && ctx->mbim_desc) return le16_to_cpu(ctx->mbim_desc->wMaxSegmentSize); if (ctx->ether_desc) return le16_to_cpu(ctx->ether_desc->wMaxSegmentSize); return CDC_NCM_MAX_DATAGRAM_SIZE; } /* initial one-time device setup. MUST be called with the data interface * in altsetting 0 */ static int cdc_ncm_init(struct usbnet *dev) { struct cdc_ncm_ctx *ctx = (struct cdc_ncm_ctx *)dev->data[0]; u8 iface_no = ctx->control->cur_altsetting->desc.bInterfaceNumber; int err; err = usbnet_read_cmd(dev, USB_CDC_GET_NTB_PARAMETERS, USB_TYPE_CLASS | USB_DIR_IN |USB_RECIP_INTERFACE, 0, iface_no, &ctx->ncm_parm, sizeof(ctx->ncm_parm)); if (err < 0) { dev_err(&dev->intf->dev, "failed GET_NTB_PARAMETERS\n"); return err; /* GET_NTB_PARAMETERS is required */ } /* set CRC Mode */ if (cdc_ncm_flags(dev) & USB_CDC_NCM_NCAP_CRC_MODE) { dev_dbg(&dev->intf->dev, "Setting CRC mode off\n"); err = usbnet_write_cmd(dev, USB_CDC_SET_CRC_MODE, USB_TYPE_CLASS | USB_DIR_OUT | USB_RECIP_INTERFACE, USB_CDC_NCM_CRC_NOT_APPENDED, iface_no, NULL, 0); if (err < 0) dev_err(&dev->intf->dev, "SET_CRC_MODE failed\n"); } /* use ndp16 by default */ ctx->is_ndp16 = 1; /* set NTB format, if both formats are supported. * * "The host shall only send this command while the NCM Data * Interface is in alternate setting 0." */ if (le16_to_cpu(ctx->ncm_parm.bmNtbFormatsSupported) & USB_CDC_NCM_NTB32_SUPPORTED) { if (ctx->drvflags & CDC_NCM_FLAG_PREFER_NTB32) { ctx->is_ndp16 = 0; dev_dbg(&dev->intf->dev, "Setting NTB format to 32-bit\n"); err = usbnet_write_cmd(dev, USB_CDC_SET_NTB_FORMAT, USB_TYPE_CLASS | USB_DIR_OUT | USB_RECIP_INTERFACE, USB_CDC_NCM_NTB32_FORMAT, iface_no, NULL, 0); } else { ctx->is_ndp16 = 1; dev_dbg(&dev->intf->dev, "Setting NTB format to 16-bit\n"); err = usbnet_write_cmd(dev, USB_CDC_SET_NTB_FORMAT, USB_TYPE_CLASS | USB_DIR_OUT | USB_RECIP_INTERFACE, USB_CDC_NCM_NTB16_FORMAT, iface_no, NULL, 0); } if (err < 0) { ctx->is_ndp16 = 1; dev_err(&dev->intf->dev, "SET_NTB_FORMAT failed\n"); } } /* set initial device values */ ctx->rx_max = le32_to_cpu(ctx->ncm_parm.dwNtbInMaxSize); ctx->tx_max = le32_to_cpu(ctx->ncm_parm.dwNtbOutMaxSize); ctx->tx_remainder = le16_to_cpu(ctx->ncm_parm.wNdpOutPayloadRemainder); ctx->tx_modulus = le16_to_cpu(ctx->ncm_parm.wNdpOutDivisor); ctx->tx_ndp_modulus = le16_to_cpu(ctx->ncm_parm.wNdpOutAlignment); /* devices prior to NCM Errata shall set this field to zero */ ctx->tx_max_datagrams = le16_to_cpu(ctx->ncm_parm.wNtbOutMaxDatagrams); dev_dbg(&dev->intf->dev, "dwNtbInMaxSize=%u dwNtbOutMaxSize=%u wNdpOutPayloadRemainder=%u wNdpOutDivisor=%u wNdpOutAlignment=%u wNtbOutMaxDatagrams=%u flags=0x%x\n", ctx->rx_max, ctx->tx_max, ctx->tx_remainder, ctx->tx_modulus, ctx->tx_ndp_modulus, ctx->tx_max_datagrams, cdc_ncm_flags(dev)); /* max count of tx datagrams */ if ((ctx->tx_max_datagrams == 0) || (ctx->tx_max_datagrams > CDC_NCM_DPT_DATAGRAMS_MAX)) ctx->tx_max_datagrams = CDC_NCM_DPT_DATAGRAMS_MAX; /* set up maximum NDP size */ if (ctx->is_ndp16) ctx->max_ndp_size = sizeof(struct usb_cdc_ncm_ndp16) + (ctx->tx_max_datagrams + 1) * sizeof(struct usb_cdc_ncm_dpe16); else ctx->max_ndp_size = sizeof(struct usb_cdc_ncm_ndp32) + (ctx->tx_max_datagrams + 1) * sizeof(struct usb_cdc_ncm_dpe32); /* initial coalescing timer interval */ ctx->timer_interval = CDC_NCM_TIMER_INTERVAL_USEC * NSEC_PER_USEC; return 0; } /* set a new max datagram size */ static void cdc_ncm_set_dgram_size(struct usbnet *dev, int new_size) { struct cdc_ncm_ctx *ctx = (struct cdc_ncm_ctx *)dev->data[0]; u8 iface_no = ctx->control->cur_altsetting->desc.bInterfaceNumber; __le16 max_datagram_size; u16 mbim_mtu; int err; /* set default based on descriptors */ ctx->max_datagram_size = clamp_t(u32, new_size, cdc_ncm_min_dgram_size(dev), CDC_NCM_MAX_DATAGRAM_SIZE); /* inform the device about the selected Max Datagram Size? */ if (!(cdc_ncm_flags(dev) & USB_CDC_NCM_NCAP_MAX_DATAGRAM_SIZE)) goto out; /* read current mtu value from device */ err = usbnet_read_cmd(dev, USB_CDC_GET_MAX_DATAGRAM_SIZE, USB_TYPE_CLASS | USB_DIR_IN | USB_RECIP_INTERFACE, 0, iface_no, &max_datagram_size, sizeof(max_datagram_size)); if (err != sizeof(max_datagram_size)) { dev_dbg(&dev->intf->dev, "GET_MAX_DATAGRAM_SIZE failed\n"); goto out; } if (le16_to_cpu(max_datagram_size) == ctx->max_datagram_size) goto out; max_datagram_size = cpu_to_le16(ctx->max_datagram_size); err = usbnet_write_cmd(dev, USB_CDC_SET_MAX_DATAGRAM_SIZE, USB_TYPE_CLASS | USB_DIR_OUT | USB_RECIP_INTERFACE, 0, iface_no, &max_datagram_size, sizeof(max_datagram_size)); if (err < 0) dev_dbg(&dev->intf->dev, "SET_MAX_DATAGRAM_SIZE failed\n"); out: /* set MTU to max supported by the device if necessary */ dev->net->mtu = min_t(int, dev->net->mtu, ctx->max_datagram_size - cdc_ncm_eth_hlen(dev)); /* do not exceed operator preferred MTU */ if (ctx->mbim_extended_desc) { mbim_mtu = le16_to_cpu(ctx->mbim_extended_desc->wMTU); if (mbim_mtu != 0 && mbim_mtu < dev->net->mtu) dev->net->mtu = mbim_mtu; } } static void cdc_ncm_fix_modulus(struct usbnet *dev) { struct cdc_ncm_ctx *ctx = (struct cdc_ncm_ctx *)dev->data[0]; u32 val; /* * verify that the structure alignment is: * - power of two * - not greater than the maximum transmit length * - not less than four bytes */ val = ctx->tx_ndp_modulus; if ((val < USB_CDC_NCM_NDP_ALIGN_MIN_SIZE) || (val != ((-val) & val)) || (val >= ctx->tx_max)) { dev_dbg(&dev->intf->dev, "Using default alignment: 4 bytes\n"); ctx->tx_ndp_modulus = USB_CDC_NCM_NDP_ALIGN_MIN_SIZE; } /* * verify that the payload alignment is: * - power of two * - not greater than the maximum transmit length * - not less than four bytes */ val = ctx->tx_modulus; if ((val < USB_CDC_NCM_NDP_ALIGN_MIN_SIZE) || (val != ((-val) & val)) || (val >= ctx->tx_max)) { dev_dbg(&dev->intf->dev, "Using default transmit modulus: 4 bytes\n"); ctx->tx_modulus = USB_CDC_NCM_NDP_ALIGN_MIN_SIZE; } /* verify the payload remainder */ if (ctx->tx_remainder >= ctx->tx_modulus) { dev_dbg(&dev->intf->dev, "Using default transmit remainder: 0 bytes\n"); ctx->tx_remainder = 0; } /* adjust TX-remainder according to NCM specification. */ ctx->tx_remainder = ((ctx->tx_remainder - cdc_ncm_eth_hlen(dev)) & (ctx->tx_modulus - 1)); } static int cdc_ncm_setup(struct usbnet *dev) { struct cdc_ncm_ctx *ctx = (struct cdc_ncm_ctx *)dev->data[0]; u32 def_rx, def_tx; /* be conservative when selecting initial buffer size to * increase the number of hosts this will work for */ def_rx = min_t(u32, CDC_NCM_NTB_DEF_SIZE_RX, le32_to_cpu(ctx->ncm_parm.dwNtbInMaxSize)); def_tx = min_t(u32, CDC_NCM_NTB_DEF_SIZE_TX, le32_to_cpu(ctx->ncm_parm.dwNtbOutMaxSize)); /* clamp rx_max and tx_max and inform device */ cdc_ncm_update_rxtx_max(dev, def_rx, def_tx); /* sanitize the modulus and remainder values */ cdc_ncm_fix_modulus(dev); /* set max datagram size */ cdc_ncm_set_dgram_size(dev, cdc_ncm_max_dgram_size(dev)); return 0; } static void cdc_ncm_find_endpoints(struct usbnet *dev, struct usb_interface *intf) { struct usb_host_endpoint *e, *in = NULL, *out = NULL; u8 ep; for (ep = 0; ep < intf->cur_altsetting->desc.bNumEndpoints; ep++) { e = intf->cur_altsetting->endpoint + ep; /* ignore endpoints which cannot transfer data */ if (!usb_endpoint_maxp(&e->desc)) continue; switch (e->desc.bmAttributes & USB_ENDPOINT_XFERTYPE_MASK) { case USB_ENDPOINT_XFER_INT: if (usb_endpoint_dir_in(&e->desc)) { if (!dev->status) dev->status = e; } break; case USB_ENDPOINT_XFER_BULK: if (usb_endpoint_dir_in(&e->desc)) { if (!in) in = e; } else { if (!out) out = e; } break; default: break; } } if (in && !dev->in) dev->in = usb_rcvbulkpipe(dev->udev, in->desc.bEndpointAddress & USB_ENDPOINT_NUMBER_MASK); if (out && !dev->out) dev->out = usb_sndbulkpipe(dev->udev, out->desc.bEndpointAddress & USB_ENDPOINT_NUMBER_MASK); } static void cdc_ncm_free(struct cdc_ncm_ctx *ctx) { if (ctx == NULL) return; if (ctx->tx_rem_skb != NULL) { dev_kfree_skb_any(ctx->tx_rem_skb); ctx->tx_rem_skb = NULL; } if (ctx->tx_curr_skb != NULL) { dev_kfree_skb_any(ctx->tx_curr_skb); ctx->tx_curr_skb = NULL; } if (ctx->is_ndp16) kfree(ctx->delayed_ndp16); else kfree(ctx->delayed_ndp32); kfree(ctx); } /* we need to override the usbnet change_mtu ndo for two reasons: * - respect the negotiated maximum datagram size * - avoid unwanted changes to rx and tx buffers */ int cdc_ncm_change_mtu(struct net_device *net, int new_mtu) { struct usbnet *dev = netdev_priv(net); net->mtu = new_mtu; cdc_ncm_set_dgram_size(dev, new_mtu + cdc_ncm_eth_hlen(dev)); return 0; } EXPORT_SYMBOL_GPL(cdc_ncm_change_mtu); static const struct net_device_ops cdc_ncm_netdev_ops = { .ndo_open = usbnet_open, .ndo_stop = usbnet_stop, .ndo_start_xmit = usbnet_start_xmit, .ndo_tx_timeout = usbnet_tx_timeout, .ndo_set_rx_mode = usbnet_set_rx_mode, .ndo_get_stats64 = dev_get_tstats64, .ndo_change_mtu = cdc_ncm_change_mtu, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, }; int cdc_ncm_bind_common(struct usbnet *dev, struct usb_interface *intf, u8 data_altsetting, int drvflags) { struct cdc_ncm_ctx *ctx; struct usb_driver *driver; u8 *buf; int len; int temp; u8 iface_no; struct usb_cdc_parsed_header hdr; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) return -ENOMEM; ctx->dev = dev; hrtimer_init(&ctx->tx_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); ctx->tx_timer.function = &cdc_ncm_tx_timer_cb; tasklet_setup(&ctx->bh, cdc_ncm_txpath_bh); atomic_set(&ctx->stop, 0); spin_lock_init(&ctx->mtx); /* store ctx pointer in device data field */ dev->data[0] = (unsigned long)ctx; /* only the control interface can be successfully probed */ ctx->control = intf; /* get some pointers */ driver = driver_of(intf); buf = intf->cur_altsetting->extra; len = intf->cur_altsetting->extralen; /* parse through descriptors associated with control interface */ cdc_parse_cdc_header(&hdr, intf, buf, len); if (hdr.usb_cdc_union_desc) ctx->data = usb_ifnum_to_if(dev->udev, hdr.usb_cdc_union_desc->bSlaveInterface0); ctx->ether_desc = hdr.usb_cdc_ether_desc; ctx->func_desc = hdr.usb_cdc_ncm_desc; ctx->mbim_desc = hdr.usb_cdc_mbim_desc; ctx->mbim_extended_desc = hdr.usb_cdc_mbim_extended_desc; /* some buggy devices have an IAD but no CDC Union */ if (!hdr.usb_cdc_union_desc && intf->intf_assoc && intf->intf_assoc->bInterfaceCount == 2) { ctx->data = usb_ifnum_to_if(dev->udev, intf->cur_altsetting->desc.bInterfaceNumber + 1); dev_dbg(&intf->dev, "CDC Union missing - got slave from IAD\n"); } /* check if we got everything */ if (!ctx->data) { dev_err(&intf->dev, "CDC Union missing and no IAD found\n"); goto error; } if (cdc_ncm_comm_intf_is_mbim(intf->cur_altsetting)) { if (!ctx->mbim_desc) { dev_err(&intf->dev, "MBIM functional descriptor missing\n"); goto error; } } else { if (!ctx->ether_desc || !ctx->func_desc) { dev_err(&intf->dev, "NCM or ECM functional descriptors missing\n"); goto error; } } /* claim data interface, if different from control */ if (ctx->data != ctx->control) { temp = usb_driver_claim_interface(driver, ctx->data, dev); if (temp) { dev_err(&intf->dev, "failed to claim data intf\n"); goto error; } } iface_no = ctx->data->cur_altsetting->desc.bInterfaceNumber; /* Device-specific flags */ ctx->drvflags = drvflags; /* Reset data interface. Some devices will not reset properly * unless they are configured first. Toggle the altsetting to * force a reset. * Some other devices do not work properly with this procedure * that can be avoided using quirk CDC_MBIM_FLAG_AVOID_ALTSETTING_TOGGLE */ if (!(ctx->drvflags & CDC_MBIM_FLAG_AVOID_ALTSETTING_TOGGLE)) usb_set_interface(dev->udev, iface_no, data_altsetting); temp = usb_set_interface(dev->udev, iface_no, 0); if (temp) { dev_dbg(&intf->dev, "set interface failed\n"); goto error2; } /* initialize basic device settings */ if (cdc_ncm_init(dev)) goto error2; /* Some firmwares need a pause here or they will silently fail * to set up the interface properly. This value was decided * empirically on a Sierra Wireless MC7455 running 02.08.02.00 * firmware. */ usleep_range(10000, 20000); /* configure data interface */ temp = usb_set_interface(dev->udev, iface_no, data_altsetting); if (temp) { dev_dbg(&intf->dev, "set interface failed\n"); goto error2; } cdc_ncm_find_endpoints(dev, ctx->data); cdc_ncm_find_endpoints(dev, ctx->control); if (!dev->in || !dev->out || !dev->status) { dev_dbg(&intf->dev, "failed to collect endpoints\n"); goto error2; } usb_set_intfdata(ctx->control, dev); if (ctx->ether_desc) { temp = usbnet_get_ethernet_addr(dev, ctx->ether_desc->iMACAddress); if (temp) { dev_err(&intf->dev, "failed to get mac address\n"); goto error2; } dev_info(&intf->dev, "MAC-Address: %pM\n", dev->net->dev_addr); } /* finish setting up the device specific data */ cdc_ncm_setup(dev); /* Allocate the delayed NDP if needed. */ if (ctx->drvflags & CDC_NCM_FLAG_NDP_TO_END) { if (ctx->is_ndp16) { ctx->delayed_ndp16 = kzalloc(ctx->max_ndp_size, GFP_KERNEL); if (!ctx->delayed_ndp16) goto error2; } else { ctx->delayed_ndp32 = kzalloc(ctx->max_ndp_size, GFP_KERNEL); if (!ctx->delayed_ndp32) goto error2; } dev_info(&intf->dev, "NDP will be placed at end of frame for this device."); } /* override ethtool_ops */ dev->net->ethtool_ops = &cdc_ncm_ethtool_ops; /* add our sysfs attrs */ dev->net->sysfs_groups[0] = &cdc_ncm_sysfs_attr_group; /* must handle MTU changes */ dev->net->netdev_ops = &cdc_ncm_netdev_ops; dev->net->max_mtu = cdc_ncm_max_dgram_size(dev) - cdc_ncm_eth_hlen(dev); return 0; error2: usb_set_intfdata(ctx->control, NULL); usb_set_intfdata(ctx->data, NULL); if (ctx->data != ctx->control) usb_driver_release_interface(driver, ctx->data); error: cdc_ncm_free((struct cdc_ncm_ctx *)dev->data[0]); dev->data[0] = 0; dev_info(&intf->dev, "bind() failure\n"); return -ENODEV; } EXPORT_SYMBOL_GPL(cdc_ncm_bind_common); void cdc_ncm_unbind(struct usbnet *dev, struct usb_interface *intf) { struct cdc_ncm_ctx *ctx = (struct cdc_ncm_ctx *)dev->data[0]; struct usb_driver *driver = driver_of(intf); if (ctx == NULL) return; /* no setup */ atomic_set(&ctx->stop, 1); hrtimer_cancel(&ctx->tx_timer); tasklet_kill(&ctx->bh); /* handle devices with combined control and data interface */ if (ctx->control == ctx->data) ctx->data = NULL; /* disconnect master --> disconnect slave */ if (intf == ctx->control && ctx->data) { usb_set_intfdata(ctx->data, NULL); usb_driver_release_interface(driver, ctx->data); ctx->data = NULL; } else if (intf == ctx->data && ctx->control) { usb_set_intfdata(ctx->control, NULL); usb_driver_release_interface(driver, ctx->control); ctx->control = NULL; } usb_set_intfdata(intf, NULL); cdc_ncm_free(ctx); } EXPORT_SYMBOL_GPL(cdc_ncm_unbind); /* Return the number of the MBIM control interface altsetting iff it * is preferred and available, */ u8 cdc_ncm_select_altsetting(struct usb_interface *intf) { struct usb_host_interface *alt; /* The MBIM spec defines a NCM compatible default altsetting, * which we may have matched: * * "Functions that implement both NCM 1.0 and MBIM (an * “NCM/MBIM function”) according to this recommendation * shall provide two alternate settings for the * Communication Interface. Alternate setting 0, and the * associated class and endpoint descriptors, shall be * constructed according to the rules given for the * Communication Interface in section 5 of [USBNCM10]. * Alternate setting 1, and the associated class and * endpoint descriptors, shall be constructed according to * the rules given in section 6 (USB Device Model) of this * specification." */ if (intf->num_altsetting < 2) return intf->cur_altsetting->desc.bAlternateSetting; if (prefer_mbim) { alt = usb_altnum_to_altsetting(intf, CDC_NCM_COMM_ALTSETTING_MBIM); if (alt && cdc_ncm_comm_intf_is_mbim(alt)) return CDC_NCM_COMM_ALTSETTING_MBIM; } return CDC_NCM_COMM_ALTSETTING_NCM; } EXPORT_SYMBOL_GPL(cdc_ncm_select_altsetting); static int cdc_ncm_bind(struct usbnet *dev, struct usb_interface *intf) { /* MBIM backwards compatible function? */ if (cdc_ncm_select_altsetting(intf) != CDC_NCM_COMM_ALTSETTING_NCM) return -ENODEV; /* The NCM data altsetting is fixed, so we hard-coded it. * Additionally, generic NCM devices are assumed to accept arbitrarily * placed NDP. */ return cdc_ncm_bind_common(dev, intf, CDC_NCM_DATA_ALTSETTING_NCM, 0); } static void cdc_ncm_align_tail(struct sk_buff *skb, size_t modulus, size_t remainder, size_t max) { size_t align = ALIGN(skb->len, modulus) - skb->len + remainder; if (skb->len + align > max) align = max - skb->len; if (align && skb_tailroom(skb) >= align) skb_put_zero(skb, align); } /* return a pointer to a valid struct usb_cdc_ncm_ndp16 of type sign, possibly * allocating a new one within skb */ static struct usb_cdc_ncm_ndp16 *cdc_ncm_ndp16(struct cdc_ncm_ctx *ctx, struct sk_buff *skb, __le32 sign, size_t reserve) { struct usb_cdc_ncm_ndp16 *ndp16 = NULL; struct usb_cdc_ncm_nth16 *nth16 = (void *)skb->data; size_t ndpoffset = le16_to_cpu(nth16->wNdpIndex); /* If NDP should be moved to the end of the NCM package, we can't follow the * NTH16 header as we would normally do. NDP isn't written to the SKB yet, and * the wNdpIndex field in the header is actually not consistent with reality. It will be later. */ if (ctx->drvflags & CDC_NCM_FLAG_NDP_TO_END) { if (ctx->delayed_ndp16->dwSignature == sign) return ctx->delayed_ndp16; /* We can only push a single NDP to the end. Return * NULL to send what we've already got and queue this * skb for later. */ else if (ctx->delayed_ndp16->dwSignature) return NULL; } /* follow the chain of NDPs, looking for a match */ while (ndpoffset) { ndp16 = (struct usb_cdc_ncm_ndp16 *)(skb->data + ndpoffset); if (ndp16->dwSignature == sign) return ndp16; ndpoffset = le16_to_cpu(ndp16->wNextNdpIndex); } /* align new NDP */ if (!(ctx->drvflags & CDC_NCM_FLAG_NDP_TO_END)) cdc_ncm_align_tail(skb, ctx->tx_ndp_modulus, 0, ctx->tx_curr_size); /* verify that there is room for the NDP and the datagram (reserve) */ if ((ctx->tx_curr_size - skb->len - reserve) < ctx->max_ndp_size) return NULL; /* link to it */ if (ndp16) ndp16->wNextNdpIndex = cpu_to_le16(skb->len); else nth16->wNdpIndex = cpu_to_le16(skb->len); /* push a new empty NDP */ if (!(ctx->drvflags & CDC_NCM_FLAG_NDP_TO_END)) ndp16 = skb_put_zero(skb, ctx->max_ndp_size); else ndp16 = ctx->delayed_ndp16; ndp16->dwSignature = sign; ndp16->wLength = cpu_to_le16(sizeof(struct usb_cdc_ncm_ndp16) + sizeof(struct usb_cdc_ncm_dpe16)); return ndp16; } static struct usb_cdc_ncm_ndp32 *cdc_ncm_ndp32(struct cdc_ncm_ctx *ctx, struct sk_buff *skb, __le32 sign, size_t reserve) { struct usb_cdc_ncm_ndp32 *ndp32 = NULL; struct usb_cdc_ncm_nth32 *nth32 = (void *)skb->data; size_t ndpoffset = le32_to_cpu(nth32->dwNdpIndex); /* If NDP should be moved to the end of the NCM package, we can't follow the * NTH32 header as we would normally do. NDP isn't written to the SKB yet, and * the wNdpIndex field in the header is actually not consistent with reality. It will be later. */ if (ctx->drvflags & CDC_NCM_FLAG_NDP_TO_END) { if (ctx->delayed_ndp32->dwSignature == sign) return ctx->delayed_ndp32; /* We can only push a single NDP to the end. Return * NULL to send what we've already got and queue this * skb for later. */ else if (ctx->delayed_ndp32->dwSignature) return NULL; } /* follow the chain of NDPs, looking for a match */ while (ndpoffset) { ndp32 = (struct usb_cdc_ncm_ndp32 *)(skb->data + ndpoffset); if (ndp32->dwSignature == sign) return ndp32; ndpoffset = le32_to_cpu(ndp32->dwNextNdpIndex); } /* align new NDP */ if (!(ctx->drvflags & CDC_NCM_FLAG_NDP_TO_END)) cdc_ncm_align_tail(skb, ctx->tx_ndp_modulus, 0, ctx->tx_curr_size); /* verify that there is room for the NDP and the datagram (reserve) */ if ((ctx->tx_curr_size - skb->len - reserve) < ctx->max_ndp_size) return NULL; /* link to it */ if (ndp32) ndp32->dwNextNdpIndex = cpu_to_le32(skb->len); else nth32->dwNdpIndex = cpu_to_le32(skb->len); /* push a new empty NDP */ if (!(ctx->drvflags & CDC_NCM_FLAG_NDP_TO_END)) ndp32 = skb_put_zero(skb, ctx->max_ndp_size); else ndp32 = ctx->delayed_ndp32; ndp32->dwSignature = sign; ndp32->wLength = cpu_to_le16(sizeof(struct usb_cdc_ncm_ndp32) + sizeof(struct usb_cdc_ncm_dpe32)); return ndp32; } struct sk_buff * cdc_ncm_fill_tx_frame(struct usbnet *dev, struct sk_buff *skb, __le32 sign) { struct cdc_ncm_ctx *ctx = (struct cdc_ncm_ctx *)dev->data[0]; union { struct usb_cdc_ncm_nth16 *nth16; struct usb_cdc_ncm_nth32 *nth32; } nth; union { struct usb_cdc_ncm_ndp16 *ndp16; struct usb_cdc_ncm_ndp32 *ndp32; } ndp; struct sk_buff *skb_out; u16 n = 0, index, ndplen; u8 ready2send = 0; u32 delayed_ndp_size; size_t padding_count; /* When our NDP gets written in cdc_ncm_ndp(), then skb_out->len gets updated * accordingly. Otherwise, we should check here. */ if (ctx->drvflags & CDC_NCM_FLAG_NDP_TO_END) delayed_ndp_size = ctx->max_ndp_size + max_t(u32, ctx->tx_ndp_modulus, ctx->tx_modulus + ctx->tx_remainder) - 1; else delayed_ndp_size = 0; /* if there is a remaining skb, it gets priority */ if (skb != NULL) { swap(skb, ctx->tx_rem_skb); swap(sign, ctx->tx_rem_sign); } else { ready2send = 1; } /* check if we are resuming an OUT skb */ skb_out = ctx->tx_curr_skb; /* allocate a new OUT skb */ if (!skb_out) { if (ctx->tx_low_mem_val == 0) { ctx->tx_curr_size = ctx->tx_max; skb_out = alloc_skb(ctx->tx_curr_size, GFP_ATOMIC); /* If the memory allocation fails we will wait longer * each time before attempting another full size * allocation again to not overload the system * further. */ if (skb_out == NULL) { /* If even the smallest allocation fails, abort. */ if (ctx->tx_curr_size == USB_CDC_NCM_NTB_MIN_OUT_SIZE) goto alloc_failed; ctx->tx_low_mem_max_cnt = min(ctx->tx_low_mem_max_cnt + 1, (unsigned)CDC_NCM_LOW_MEM_MAX_CNT); ctx->tx_low_mem_val = ctx->tx_low_mem_max_cnt; } } if (skb_out == NULL) { /* See if a very small allocation is possible. * We will send this packet immediately and hope * that there is more memory available later. */ if (skb) ctx->tx_curr_size = max(skb->len, (u32)USB_CDC_NCM_NTB_MIN_OUT_SIZE); else ctx->tx_curr_size = USB_CDC_NCM_NTB_MIN_OUT_SIZE; skb_out = alloc_skb(ctx->tx_curr_size, GFP_ATOMIC); /* No allocation possible so we will abort */ if (!skb_out) goto alloc_failed; ctx->tx_low_mem_val--; } if (ctx->is_ndp16) { /* fill out the initial 16-bit NTB header */ nth.nth16 = skb_put_zero(skb_out, sizeof(struct usb_cdc_ncm_nth16)); nth.nth16->dwSignature = cpu_to_le32(USB_CDC_NCM_NTH16_SIGN); nth.nth16->wHeaderLength = cpu_to_le16(sizeof(struct usb_cdc_ncm_nth16)); nth.nth16->wSequence = cpu_to_le16(ctx->tx_seq++); } else { /* fill out the initial 32-bit NTB header */ nth.nth32 = skb_put_zero(skb_out, sizeof(struct usb_cdc_ncm_nth32)); nth.nth32->dwSignature = cpu_to_le32(USB_CDC_NCM_NTH32_SIGN); nth.nth32->wHeaderLength = cpu_to_le16(sizeof(struct usb_cdc_ncm_nth32)); nth.nth32->wSequence = cpu_to_le16(ctx->tx_seq++); } /* count total number of frames in this NTB */ ctx->tx_curr_frame_num = 0; /* recent payload counter for this skb_out */ ctx->tx_curr_frame_payload = 0; } for (n = ctx->tx_curr_frame_num; n < ctx->tx_max_datagrams; n++) { /* send any remaining skb first */ if (skb == NULL) { skb = ctx->tx_rem_skb; sign = ctx->tx_rem_sign; ctx->tx_rem_skb = NULL; /* check for end of skb */ if (skb == NULL) break; } /* get the appropriate NDP for this skb */ if (ctx->is_ndp16) ndp.ndp16 = cdc_ncm_ndp16(ctx, skb_out, sign, skb->len + ctx->tx_modulus + ctx->tx_remainder); else ndp.ndp32 = cdc_ncm_ndp32(ctx, skb_out, sign, skb->len + ctx->tx_modulus + ctx->tx_remainder); /* align beginning of next frame */ cdc_ncm_align_tail(skb_out, ctx->tx_modulus, ctx->tx_remainder, ctx->tx_curr_size); /* check if we had enough room left for both NDP and frame */ if ((ctx->is_ndp16 && !ndp.ndp16) || (!ctx->is_ndp16 && !ndp.ndp32) || skb_out->len + skb->len + delayed_ndp_size > ctx->tx_curr_size) { if (n == 0) { /* won't fit, MTU problem? */ dev_kfree_skb_any(skb); skb = NULL; dev->net->stats.tx_dropped++; } else { /* no room for skb - store for later */ if (ctx->tx_rem_skb != NULL) { dev_kfree_skb_any(ctx->tx_rem_skb); dev->net->stats.tx_dropped++; } ctx->tx_rem_skb = skb; ctx->tx_rem_sign = sign; skb = NULL; ready2send = 1; ctx->tx_reason_ntb_full++; /* count reason for transmitting */ } break; } /* calculate frame number within this NDP */ if (ctx->is_ndp16) { ndplen = le16_to_cpu(ndp.ndp16->wLength); index = (ndplen - sizeof(struct usb_cdc_ncm_ndp16)) / sizeof(struct usb_cdc_ncm_dpe16) - 1; /* OK, add this skb */ ndp.ndp16->dpe16[index].wDatagramLength = cpu_to_le16(skb->len); ndp.ndp16->dpe16[index].wDatagramIndex = cpu_to_le16(skb_out->len); ndp.ndp16->wLength = cpu_to_le16(ndplen + sizeof(struct usb_cdc_ncm_dpe16)); } else { ndplen = le16_to_cpu(ndp.ndp32->wLength); index = (ndplen - sizeof(struct usb_cdc_ncm_ndp32)) / sizeof(struct usb_cdc_ncm_dpe32) - 1; ndp.ndp32->dpe32[index].dwDatagramLength = cpu_to_le32(skb->len); ndp.ndp32->dpe32[index].dwDatagramIndex = cpu_to_le32(skb_out->len); ndp.ndp32->wLength = cpu_to_le16(ndplen + sizeof(struct usb_cdc_ncm_dpe32)); } skb_put_data(skb_out, skb->data, skb->len); ctx->tx_curr_frame_payload += skb->len; /* count real tx payload data */ dev_kfree_skb_any(skb); skb = NULL; /* send now if this NDP is full */ if (index >= CDC_NCM_DPT_DATAGRAMS_MAX) { ready2send = 1; ctx->tx_reason_ndp_full++; /* count reason for transmitting */ break; } } /* free up any dangling skb */ if (skb != NULL) { dev_kfree_skb_any(skb); skb = NULL; dev->net->stats.tx_dropped++; } ctx->tx_curr_frame_num = n; if (n == 0) { /* wait for more frames */ /* push variables */ ctx->tx_curr_skb = skb_out; goto exit_no_skb; } else if ((n < ctx->tx_max_datagrams) && (ready2send == 0) && (ctx->timer_interval > 0)) { /* wait for more frames */ /* push variables */ ctx->tx_curr_skb = skb_out; /* set the pending count */ if (n < CDC_NCM_RESTART_TIMER_DATAGRAM_CNT) ctx->tx_timer_pending = CDC_NCM_TIMER_PENDING_CNT; goto exit_no_skb; } else { if (n == ctx->tx_max_datagrams) ctx->tx_reason_max_datagram++; /* count reason for transmitting */ /* frame goes out */ /* variables will be reset at next call */ } /* If requested, put NDP at end of frame. */ if (ctx->drvflags & CDC_NCM_FLAG_NDP_TO_END) { if (ctx->is_ndp16) { nth.nth16 = (struct usb_cdc_ncm_nth16 *)skb_out->data; cdc_ncm_align_tail(skb_out, ctx->tx_ndp_modulus, 0, ctx->tx_curr_size - ctx->max_ndp_size); nth.nth16->wNdpIndex = cpu_to_le16(skb_out->len); skb_put_data(skb_out, ctx->delayed_ndp16, ctx->max_ndp_size); /* Zero out delayed NDP - signature checking will naturally fail. */ ndp.ndp16 = memset(ctx->delayed_ndp16, 0, ctx->max_ndp_size); } else { nth.nth32 = (struct usb_cdc_ncm_nth32 *)skb_out->data; cdc_ncm_align_tail(skb_out, ctx->tx_ndp_modulus, 0, ctx->tx_curr_size - ctx->max_ndp_size); nth.nth32->dwNdpIndex = cpu_to_le32(skb_out->len); skb_put_data(skb_out, ctx->delayed_ndp32, ctx->max_ndp_size); ndp.ndp32 = memset(ctx->delayed_ndp32, 0, ctx->max_ndp_size); } } /* If collected data size is less or equal ctx->min_tx_pkt * bytes, we send buffers as it is. If we get more data, it * would be more efficient for USB HS mobile device with DMA * engine to receive a full size NTB, than canceling DMA * transfer and receiving a short packet. * * This optimization support is pointless if we end up sending * a ZLP after full sized NTBs. */ if (!(dev->driver_info->flags & FLAG_SEND_ZLP) && skb_out->len > ctx->min_tx_pkt) { padding_count = ctx->tx_curr_size - skb_out->len; if (!WARN_ON(padding_count > ctx->tx_curr_size)) skb_put_zero(skb_out, padding_count); } else if (skb_out->len < ctx->tx_curr_size && (skb_out->len % dev->maxpacket) == 0) { skb_put_u8(skb_out, 0); /* force short packet */ } /* set final frame length */ if (ctx->is_ndp16) { nth.nth16 = (struct usb_cdc_ncm_nth16 *)skb_out->data; nth.nth16->wBlockLength = cpu_to_le16(skb_out->len); } else { nth.nth32 = (struct usb_cdc_ncm_nth32 *)skb_out->data; nth.nth32->dwBlockLength = cpu_to_le32(skb_out->len); } /* return skb */ ctx->tx_curr_skb = NULL; /* keep private stats: framing overhead and number of NTBs */ ctx->tx_overhead += skb_out->len - ctx->tx_curr_frame_payload; ctx->tx_ntbs++; /* usbnet will count all the framing overhead by default. * Adjust the stats so that the tx_bytes counter show real * payload data instead. */ usbnet_set_skb_tx_stats(skb_out, n, (long)ctx->tx_curr_frame_payload - skb_out->len); return skb_out; alloc_failed: if (skb) { dev_kfree_skb_any(skb); dev->net->stats.tx_dropped++; } exit_no_skb: /* Start timer, if there is a remaining non-empty skb */ if (ctx->tx_curr_skb != NULL && n > 0) cdc_ncm_tx_timeout_start(ctx); return NULL; } EXPORT_SYMBOL_GPL(cdc_ncm_fill_tx_frame); static void cdc_ncm_tx_timeout_start(struct cdc_ncm_ctx *ctx) { /* start timer, if not already started */ if (!(hrtimer_active(&ctx->tx_timer) || atomic_read(&ctx->stop))) hrtimer_start(&ctx->tx_timer, ctx->timer_interval, HRTIMER_MODE_REL); } static enum hrtimer_restart cdc_ncm_tx_timer_cb(struct hrtimer *timer) { struct cdc_ncm_ctx *ctx = container_of(timer, struct cdc_ncm_ctx, tx_timer); if (!atomic_read(&ctx->stop)) tasklet_schedule(&ctx->bh); return HRTIMER_NORESTART; } static void cdc_ncm_txpath_bh(struct tasklet_struct *t) { struct cdc_ncm_ctx *ctx = from_tasklet(ctx, t, bh); struct usbnet *dev = ctx->dev; spin_lock(&ctx->mtx); if (ctx->tx_timer_pending != 0) { ctx->tx_timer_pending--; cdc_ncm_tx_timeout_start(ctx); spin_unlock(&ctx->mtx); } else if (dev->net != NULL) { ctx->tx_reason_timeout++; /* count reason for transmitting */ spin_unlock(&ctx->mtx); netif_tx_lock_bh(dev->net); usbnet_start_xmit(NULL, dev->net); netif_tx_unlock_bh(dev->net); } else { spin_unlock(&ctx->mtx); } } struct sk_buff * cdc_ncm_tx_fixup(struct usbnet *dev, struct sk_buff *skb, gfp_t flags) { struct sk_buff *skb_out; struct cdc_ncm_ctx *ctx = (struct cdc_ncm_ctx *)dev->data[0]; /* * The Ethernet API we are using does not support transmitting * multiple Ethernet frames in a single call. This driver will * accumulate multiple Ethernet frames and send out a larger * USB frame when the USB buffer is full or when a single jiffies * timeout happens. */ if (ctx == NULL) goto error; spin_lock_bh(&ctx->mtx); if (ctx->is_ndp16) skb_out = cdc_ncm_fill_tx_frame(dev, skb, cpu_to_le32(USB_CDC_NCM_NDP16_NOCRC_SIGN)); else skb_out = cdc_ncm_fill_tx_frame(dev, skb, cpu_to_le32(USB_CDC_NCM_NDP32_NOCRC_SIGN)); spin_unlock_bh(&ctx->mtx); return skb_out; error: if (skb != NULL) dev_kfree_skb_any(skb); return NULL; } EXPORT_SYMBOL_GPL(cdc_ncm_tx_fixup); /* verify NTB header and return offset of first NDP, or negative error */ int cdc_ncm_rx_verify_nth16(struct cdc_ncm_ctx *ctx, struct sk_buff *skb_in) { struct usbnet *dev = netdev_priv(skb_in->dev); struct usb_cdc_ncm_nth16 *nth16; int len; int ret = -EINVAL; if (ctx == NULL) goto error; if (skb_in->len < (sizeof(struct usb_cdc_ncm_nth16) + sizeof(struct usb_cdc_ncm_ndp16))) { netif_dbg(dev, rx_err, dev->net, "frame too short\n"); goto error; } nth16 = (struct usb_cdc_ncm_nth16 *)skb_in->data; if (nth16->dwSignature != cpu_to_le32(USB_CDC_NCM_NTH16_SIGN)) { netif_dbg(dev, rx_err, dev->net, "invalid NTH16 signature <%#010x>\n", le32_to_cpu(nth16->dwSignature)); goto error; } len = le16_to_cpu(nth16->wBlockLength); if (len > ctx->rx_max) { netif_dbg(dev, rx_err, dev->net, "unsupported NTB block length %u/%u\n", len, ctx->rx_max); goto error; } if ((ctx->rx_seq + 1) != le16_to_cpu(nth16->wSequence) && (ctx->rx_seq || le16_to_cpu(nth16->wSequence)) && !((ctx->rx_seq == 0xffff) && !le16_to_cpu(nth16->wSequence))) { netif_dbg(dev, rx_err, dev->net, "sequence number glitch prev=%d curr=%d\n", ctx->rx_seq, le16_to_cpu(nth16->wSequence)); } ctx->rx_seq = le16_to_cpu(nth16->wSequence); ret = le16_to_cpu(nth16->wNdpIndex); error: return ret; } EXPORT_SYMBOL_GPL(cdc_ncm_rx_verify_nth16); int cdc_ncm_rx_verify_nth32(struct cdc_ncm_ctx *ctx, struct sk_buff *skb_in) { struct usbnet *dev = netdev_priv(skb_in->dev); struct usb_cdc_ncm_nth32 *nth32; int len; int ret = -EINVAL; if (ctx == NULL) goto error; if (skb_in->len < (sizeof(struct usb_cdc_ncm_nth32) + sizeof(struct usb_cdc_ncm_ndp32))) { netif_dbg(dev, rx_err, dev->net, "frame too short\n"); goto error; } nth32 = (struct usb_cdc_ncm_nth32 *)skb_in->data; if (nth32->dwSignature != cpu_to_le32(USB_CDC_NCM_NTH32_SIGN)) { netif_dbg(dev, rx_err, dev->net, "invalid NTH32 signature <%#010x>\n", le32_to_cpu(nth32->dwSignature)); goto error; } len = le32_to_cpu(nth32->dwBlockLength); if (len > ctx->rx_max) { netif_dbg(dev, rx_err, dev->net, "unsupported NTB block length %u/%u\n", len, ctx->rx_max); goto error; } if ((ctx->rx_seq + 1) != le16_to_cpu(nth32->wSequence) && (ctx->rx_seq || le16_to_cpu(nth32->wSequence)) && !((ctx->rx_seq == 0xffff) && !le16_to_cpu(nth32->wSequence))) { netif_dbg(dev, rx_err, dev->net, "sequence number glitch prev=%d curr=%d\n", ctx->rx_seq, le16_to_cpu(nth32->wSequence)); } ctx->rx_seq = le16_to_cpu(nth32->wSequence); ret = le32_to_cpu(nth32->dwNdpIndex); error: return ret; } EXPORT_SYMBOL_GPL(cdc_ncm_rx_verify_nth32); /* verify NDP header and return number of datagrams, or negative error */ int cdc_ncm_rx_verify_ndp16(struct sk_buff *skb_in, int ndpoffset) { struct usbnet *dev = netdev_priv(skb_in->dev); struct usb_cdc_ncm_ndp16 *ndp16; int ret = -EINVAL; if ((ndpoffset + sizeof(struct usb_cdc_ncm_ndp16)) > skb_in->len) { netif_dbg(dev, rx_err, dev->net, "invalid NDP offset <%u>\n", ndpoffset); goto error; } ndp16 = (struct usb_cdc_ncm_ndp16 *)(skb_in->data + ndpoffset); if (le16_to_cpu(ndp16->wLength) < USB_CDC_NCM_NDP16_LENGTH_MIN) { netif_dbg(dev, rx_err, dev->net, "invalid DPT16 length <%u>\n", le16_to_cpu(ndp16->wLength)); goto error; } ret = ((le16_to_cpu(ndp16->wLength) - sizeof(struct usb_cdc_ncm_ndp16)) / sizeof(struct usb_cdc_ncm_dpe16)); ret--; /* we process NDP entries except for the last one */ if ((sizeof(struct usb_cdc_ncm_ndp16) + ret * (sizeof(struct usb_cdc_ncm_dpe16))) > skb_in->len) { netif_dbg(dev, rx_err, dev->net, "Invalid nframes = %d\n", ret); ret = -EINVAL; } error: return ret; } EXPORT_SYMBOL_GPL(cdc_ncm_rx_verify_ndp16); /* verify NDP header and return number of datagrams, or negative error */ int cdc_ncm_rx_verify_ndp32(struct sk_buff *skb_in, int ndpoffset) { struct usbnet *dev = netdev_priv(skb_in->dev); struct usb_cdc_ncm_ndp32 *ndp32; int ret = -EINVAL; if ((ndpoffset + sizeof(struct usb_cdc_ncm_ndp32)) > skb_in->len) { netif_dbg(dev, rx_err, dev->net, "invalid NDP offset <%u>\n", ndpoffset); goto error; } ndp32 = (struct usb_cdc_ncm_ndp32 *)(skb_in->data + ndpoffset); if (le16_to_cpu(ndp32->wLength) < USB_CDC_NCM_NDP32_LENGTH_MIN) { netif_dbg(dev, rx_err, dev->net, "invalid DPT32 length <%u>\n", le16_to_cpu(ndp32->wLength)); goto error; } ret = ((le16_to_cpu(ndp32->wLength) - sizeof(struct usb_cdc_ncm_ndp32)) / sizeof(struct usb_cdc_ncm_dpe32)); ret--; /* we process NDP entries except for the last one */ if ((sizeof(struct usb_cdc_ncm_ndp32) + ret * (sizeof(struct usb_cdc_ncm_dpe32))) > skb_in->len) { netif_dbg(dev, rx_err, dev->net, "Invalid nframes = %d\n", ret); ret = -EINVAL; } error: return ret; } EXPORT_SYMBOL_GPL(cdc_ncm_rx_verify_ndp32); int cdc_ncm_rx_fixup(struct usbnet *dev, struct sk_buff *skb_in) { struct sk_buff *skb; struct cdc_ncm_ctx *ctx = (struct cdc_ncm_ctx *)dev->data[0]; unsigned int len; int nframes; int x; unsigned int offset; union { struct usb_cdc_ncm_ndp16 *ndp16; struct usb_cdc_ncm_ndp32 *ndp32; } ndp; union { struct usb_cdc_ncm_dpe16 *dpe16; struct usb_cdc_ncm_dpe32 *dpe32; } dpe; int ndpoffset; int loopcount = 50; /* arbitrary max preventing infinite loop */ u32 payload = 0; if (ctx->is_ndp16) ndpoffset = cdc_ncm_rx_verify_nth16(ctx, skb_in); else ndpoffset = cdc_ncm_rx_verify_nth32(ctx, skb_in); if (ndpoffset < 0) goto error; next_ndp: if (ctx->is_ndp16) { nframes = cdc_ncm_rx_verify_ndp16(skb_in, ndpoffset); if (nframes < 0) goto error; ndp.ndp16 = (struct usb_cdc_ncm_ndp16 *)(skb_in->data + ndpoffset); if (ndp.ndp16->dwSignature != cpu_to_le32(USB_CDC_NCM_NDP16_NOCRC_SIGN)) { netif_dbg(dev, rx_err, dev->net, "invalid DPT16 signature <%#010x>\n", le32_to_cpu(ndp.ndp16->dwSignature)); goto err_ndp; } dpe.dpe16 = ndp.ndp16->dpe16; } else { nframes = cdc_ncm_rx_verify_ndp32(skb_in, ndpoffset); if (nframes < 0) goto error; ndp.ndp32 = (struct usb_cdc_ncm_ndp32 *)(skb_in->data + ndpoffset); if (ndp.ndp32->dwSignature != cpu_to_le32(USB_CDC_NCM_NDP32_NOCRC_SIGN)) { netif_dbg(dev, rx_err, dev->net, "invalid DPT32 signature <%#010x>\n", le32_to_cpu(ndp.ndp32->dwSignature)); goto err_ndp; } dpe.dpe32 = ndp.ndp32->dpe32; } for (x = 0; x < nframes; x++) { if (ctx->is_ndp16) { offset = le16_to_cpu(dpe.dpe16->wDatagramIndex); len = le16_to_cpu(dpe.dpe16->wDatagramLength); } else { offset = le32_to_cpu(dpe.dpe32->dwDatagramIndex); len = le32_to_cpu(dpe.dpe32->dwDatagramLength); } /* * CDC NCM ch. 3.7 * All entries after first NULL entry are to be ignored */ if ((offset == 0) || (len == 0)) { if (!x) goto err_ndp; /* empty NTB */ break; } /* sanity checking - watch out for integer wrap*/ if ((offset > skb_in->len) || (len > skb_in->len - offset) || (len > ctx->rx_max) || (len < ETH_HLEN)) { netif_dbg(dev, rx_err, dev->net, "invalid frame detected (ignored) offset[%u]=%u, length=%u, skb=%p\n", x, offset, len, skb_in); if (!x) goto err_ndp; break; } else { /* create a fresh copy to reduce truesize */ skb = netdev_alloc_skb_ip_align(dev->net, len); if (!skb) goto error; skb_put_data(skb, skb_in->data + offset, len); usbnet_skb_return(dev, skb); payload += len; /* count payload bytes in this NTB */ } if (ctx->is_ndp16) dpe.dpe16++; else dpe.dpe32++; } err_ndp: /* are there more NDPs to process? */ if (ctx->is_ndp16) ndpoffset = le16_to_cpu(ndp.ndp16->wNextNdpIndex); else ndpoffset = le32_to_cpu(ndp.ndp32->dwNextNdpIndex); if (ndpoffset && loopcount--) goto next_ndp; /* update stats */ ctx->rx_overhead += skb_in->len - payload; ctx->rx_ntbs++; return 1; error: return 0; } EXPORT_SYMBOL_GPL(cdc_ncm_rx_fixup); static void cdc_ncm_speed_change(struct usbnet *dev, struct usb_cdc_speed_change *data) { /* RTL8156 shipped before 2021 sends notification about every 32ms. */ dev->rx_speed = le32_to_cpu(data->DLBitRRate); dev->tx_speed = le32_to_cpu(data->ULBitRate); } static void cdc_ncm_status(struct usbnet *dev, struct urb *urb) { struct usb_cdc_notification *event; if (urb->actual_length < sizeof(*event)) return; /* test for split data in 8-byte chunks */ if (test_and_clear_bit(EVENT_STS_SPLIT, &dev->flags)) { cdc_ncm_speed_change(dev, (struct usb_cdc_speed_change *)urb->transfer_buffer); return; } event = urb->transfer_buffer; switch (event->bNotificationType) { case USB_CDC_NOTIFY_NETWORK_CONNECTION: /* * According to the CDC NCM specification ch.7.1 * USB_CDC_NOTIFY_NETWORK_CONNECTION notification shall be * sent by device after USB_CDC_NOTIFY_SPEED_CHANGE. */ /* RTL8156 shipped before 2021 sends notification about * every 32ms. Don't forward notification if state is same. */ if (netif_carrier_ok(dev->net) != !!event->wValue) usbnet_link_change(dev, !!event->wValue, 0); break; case USB_CDC_NOTIFY_SPEED_CHANGE: if (urb->actual_length < (sizeof(*event) + sizeof(struct usb_cdc_speed_change))) set_bit(EVENT_STS_SPLIT, &dev->flags); else cdc_ncm_speed_change(dev, (struct usb_cdc_speed_change *)&event[1]); break; default: dev_dbg(&dev->udev->dev, "NCM: unexpected notification 0x%02x!\n", event->bNotificationType); break; } } static const struct driver_info cdc_ncm_info = { .description = "CDC NCM (NO ZLP)", .flags = FLAG_POINTTOPOINT | FLAG_NO_SETINT | FLAG_MULTI_PACKET | FLAG_LINK_INTR | FLAG_ETHER, .bind = cdc_ncm_bind, .unbind = cdc_ncm_unbind, .manage_power = usbnet_manage_power, .status = cdc_ncm_status, .rx_fixup = cdc_ncm_rx_fixup, .tx_fixup = cdc_ncm_tx_fixup, .set_rx_mode = usbnet_cdc_update_filter, }; /* Same as cdc_ncm_info, but with FLAG_SEND_ZLP */ static const struct driver_info cdc_ncm_zlp_info = { .description = "CDC NCM (SEND ZLP)", .flags = FLAG_POINTTOPOINT | FLAG_NO_SETINT | FLAG_MULTI_PACKET | FLAG_LINK_INTR | FLAG_ETHER | FLAG_SEND_ZLP, .bind = cdc_ncm_bind, .unbind = cdc_ncm_unbind, .manage_power = usbnet_manage_power, .status = cdc_ncm_status, .rx_fixup = cdc_ncm_rx_fixup, .tx_fixup = cdc_ncm_tx_fixup, .set_rx_mode = usbnet_cdc_update_filter, }; /* Same as cdc_ncm_info, but with FLAG_WWAN */ static const struct driver_info wwan_info = { .description = "Mobile Broadband Network Device", .flags = FLAG_POINTTOPOINT | FLAG_NO_SETINT | FLAG_MULTI_PACKET | FLAG_LINK_INTR | FLAG_WWAN, .bind = cdc_ncm_bind, .unbind = cdc_ncm_unbind, .manage_power = usbnet_manage_power, .status = cdc_ncm_status, .rx_fixup = cdc_ncm_rx_fixup, .tx_fixup = cdc_ncm_tx_fixup, .set_rx_mode = usbnet_cdc_update_filter, }; /* Same as wwan_info, but with FLAG_NOARP */ static const struct driver_info wwan_noarp_info = { .description = "Mobile Broadband Network Device (NO ARP)", .flags = FLAG_POINTTOPOINT | FLAG_NO_SETINT | FLAG_MULTI_PACKET | FLAG_LINK_INTR | FLAG_WWAN | FLAG_NOARP, .bind = cdc_ncm_bind, .unbind = cdc_ncm_unbind, .manage_power = usbnet_manage_power, .status = cdc_ncm_status, .rx_fixup = cdc_ncm_rx_fixup, .tx_fixup = cdc_ncm_tx_fixup, .set_rx_mode = usbnet_cdc_update_filter, }; static const struct usb_device_id cdc_devs[] = { /* Ericsson MBM devices like F5521gw */ { .match_flags = USB_DEVICE_ID_MATCH_INT_INFO | USB_DEVICE_ID_MATCH_VENDOR, .idVendor = 0x0bdb, .bInterfaceClass = USB_CLASS_COMM, .bInterfaceSubClass = USB_CDC_SUBCLASS_NCM, .bInterfaceProtocol = USB_CDC_PROTO_NONE, .driver_info = (unsigned long) &wwan_info, }, /* Telit LE910 V2 */ { USB_DEVICE_AND_INTERFACE_INFO(0x1bc7, 0x0036, USB_CLASS_COMM, USB_CDC_SUBCLASS_NCM, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&wwan_noarp_info, }, /* DW5812 LTE Verizon Mobile Broadband Card * Unlike DW5550 this device requires FLAG_NOARP */ { USB_DEVICE_AND_INTERFACE_INFO(0x413c, 0x81bb, USB_CLASS_COMM, USB_CDC_SUBCLASS_NCM, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&wwan_noarp_info, }, /* DW5813 LTE AT&T Mobile Broadband Card * Unlike DW5550 this device requires FLAG_NOARP */ { USB_DEVICE_AND_INTERFACE_INFO(0x413c, 0x81bc, USB_CLASS_COMM, USB_CDC_SUBCLASS_NCM, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&wwan_noarp_info, }, /* Dell branded MBM devices like DW5550 */ { .match_flags = USB_DEVICE_ID_MATCH_INT_INFO | USB_DEVICE_ID_MATCH_VENDOR, .idVendor = 0x413c, .bInterfaceClass = USB_CLASS_COMM, .bInterfaceSubClass = USB_CDC_SUBCLASS_NCM, .bInterfaceProtocol = USB_CDC_PROTO_NONE, .driver_info = (unsigned long) &wwan_info, }, /* Toshiba branded MBM devices */ { .match_flags = USB_DEVICE_ID_MATCH_INT_INFO | USB_DEVICE_ID_MATCH_VENDOR, .idVendor = 0x0930, .bInterfaceClass = USB_CLASS_COMM, .bInterfaceSubClass = USB_CDC_SUBCLASS_NCM, .bInterfaceProtocol = USB_CDC_PROTO_NONE, .driver_info = (unsigned long) &wwan_info, }, /* tag Huawei devices as wwan */ { USB_VENDOR_AND_INTERFACE_INFO(0x12d1, USB_CLASS_COMM, USB_CDC_SUBCLASS_NCM, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&wwan_info, }, /* Infineon(now Intel) HSPA Modem platform */ { USB_DEVICE_AND_INTERFACE_INFO(0x1519, 0x0443, USB_CLASS_COMM, USB_CDC_SUBCLASS_NCM, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&wwan_noarp_info, }, /* u-blox TOBY-L4 */ { USB_DEVICE_AND_INTERFACE_INFO(0x1546, 0x1010, USB_CLASS_COMM, USB_CDC_SUBCLASS_NCM, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&wwan_info, }, /* DisplayLink docking stations */ { .match_flags = USB_DEVICE_ID_MATCH_INT_INFO | USB_DEVICE_ID_MATCH_VENDOR, .idVendor = 0x17e9, .bInterfaceClass = USB_CLASS_COMM, .bInterfaceSubClass = USB_CDC_SUBCLASS_NCM, .bInterfaceProtocol = USB_CDC_PROTO_NONE, .driver_info = (unsigned long)&cdc_ncm_zlp_info, }, /* Generic CDC-NCM devices */ { USB_INTERFACE_INFO(USB_CLASS_COMM, USB_CDC_SUBCLASS_NCM, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&cdc_ncm_info, }, { }, }; MODULE_DEVICE_TABLE(usb, cdc_devs); static struct usb_driver cdc_ncm_driver = { .name = "cdc_ncm", .id_table = cdc_devs, .probe = usbnet_probe, .disconnect = usbnet_disconnect, .suspend = usbnet_suspend, .resume = usbnet_resume, .reset_resume = usbnet_resume, .supports_autosuspend = 1, .disable_hub_initiated_lpm = 1, }; module_usb_driver(cdc_ncm_driver); MODULE_AUTHOR("Hans Petter Selasky"); MODULE_DESCRIPTION("USB CDC NCM host driver"); MODULE_LICENSE("Dual BSD/GPL"); |
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 | // SPDX-License-Identifier: GPL-2.0-only /* * Sysfs interface for the universal power supply monitor class * * Copyright © 2007 David Woodhouse <dwmw2@infradead.org> * Copyright © 2007 Anton Vorontsov <cbou@mail.ru> * Copyright © 2004 Szabolcs Gyurko * Copyright © 2003 Ian Molton <spyro@f2s.com> * * Modified: 2004, Oct Szabolcs Gyurko */ #include <linux/ctype.h> #include <linux/device.h> #include <linux/power_supply.h> #include <linux/slab.h> #include <linux/stat.h> #include "power_supply.h" #define MAX_PROP_NAME_LEN 30 struct power_supply_attr { const char *prop_name; char attr_name[MAX_PROP_NAME_LEN + 1]; struct device_attribute dev_attr; const char * const *text_values; int text_values_len; }; #define _POWER_SUPPLY_ATTR(_name, _text, _len) \ [POWER_SUPPLY_PROP_ ## _name] = \ { \ .prop_name = #_name, \ .attr_name = #_name "\0", \ .text_values = _text, \ .text_values_len = _len, \ } #define POWER_SUPPLY_ATTR(_name) _POWER_SUPPLY_ATTR(_name, NULL, 0) #define _POWER_SUPPLY_ENUM_ATTR(_name, _text) \ _POWER_SUPPLY_ATTR(_name, _text, ARRAY_SIZE(_text)) #define POWER_SUPPLY_ENUM_ATTR(_name) \ _POWER_SUPPLY_ENUM_ATTR(_name, POWER_SUPPLY_ ## _name ## _TEXT) static const char * const POWER_SUPPLY_TYPE_TEXT[] = { [POWER_SUPPLY_TYPE_UNKNOWN] = "Unknown", [POWER_SUPPLY_TYPE_BATTERY] = "Battery", [POWER_SUPPLY_TYPE_UPS] = "UPS", [POWER_SUPPLY_TYPE_MAINS] = "Mains", [POWER_SUPPLY_TYPE_USB] = "USB", [POWER_SUPPLY_TYPE_USB_DCP] = "USB_DCP", [POWER_SUPPLY_TYPE_USB_CDP] = "USB_CDP", [POWER_SUPPLY_TYPE_USB_ACA] = "USB_ACA", [POWER_SUPPLY_TYPE_USB_TYPE_C] = "USB_C", [POWER_SUPPLY_TYPE_USB_PD] = "USB_PD", [POWER_SUPPLY_TYPE_USB_PD_DRP] = "USB_PD_DRP", [POWER_SUPPLY_TYPE_APPLE_BRICK_ID] = "BrickID", [POWER_SUPPLY_TYPE_WIRELESS] = "Wireless", }; static const char * const POWER_SUPPLY_USB_TYPE_TEXT[] = { [POWER_SUPPLY_USB_TYPE_UNKNOWN] = "Unknown", [POWER_SUPPLY_USB_TYPE_SDP] = "SDP", [POWER_SUPPLY_USB_TYPE_DCP] = "DCP", [POWER_SUPPLY_USB_TYPE_CDP] = "CDP", [POWER_SUPPLY_USB_TYPE_ACA] = "ACA", [POWER_SUPPLY_USB_TYPE_C] = "C", [POWER_SUPPLY_USB_TYPE_PD] = "PD", [POWER_SUPPLY_USB_TYPE_PD_DRP] = "PD_DRP", [POWER_SUPPLY_USB_TYPE_PD_PPS] = "PD_PPS", [POWER_SUPPLY_USB_TYPE_APPLE_BRICK_ID] = "BrickID", }; static const char * const POWER_SUPPLY_STATUS_TEXT[] = { [POWER_SUPPLY_STATUS_UNKNOWN] = "Unknown", [POWER_SUPPLY_STATUS_CHARGING] = "Charging", [POWER_SUPPLY_STATUS_DISCHARGING] = "Discharging", [POWER_SUPPLY_STATUS_NOT_CHARGING] = "Not charging", [POWER_SUPPLY_STATUS_FULL] = "Full", }; static const char * const POWER_SUPPLY_CHARGE_TYPE_TEXT[] = { [POWER_SUPPLY_CHARGE_TYPE_UNKNOWN] = "Unknown", [POWER_SUPPLY_CHARGE_TYPE_NONE] = "N/A", [POWER_SUPPLY_CHARGE_TYPE_TRICKLE] = "Trickle", [POWER_SUPPLY_CHARGE_TYPE_FAST] = "Fast", [POWER_SUPPLY_CHARGE_TYPE_STANDARD] = "Standard", [POWER_SUPPLY_CHARGE_TYPE_ADAPTIVE] = "Adaptive", [POWER_SUPPLY_CHARGE_TYPE_CUSTOM] = "Custom", [POWER_SUPPLY_CHARGE_TYPE_LONGLIFE] = "Long Life", [POWER_SUPPLY_CHARGE_TYPE_BYPASS] = "Bypass", [POWER_SUPPLY_CHARGE_TYPE_TAPER_EXT] = "Taper", }; static const char * const POWER_SUPPLY_HEALTH_TEXT[] = { [POWER_SUPPLY_HEALTH_UNKNOWN] = "Unknown", [POWER_SUPPLY_HEALTH_GOOD] = "Good", [POWER_SUPPLY_HEALTH_OVERHEAT] = "Overheat", [POWER_SUPPLY_HEALTH_DEAD] = "Dead", [POWER_SUPPLY_HEALTH_OVERVOLTAGE] = "Over voltage", [POWER_SUPPLY_HEALTH_UNSPEC_FAILURE] = "Unspecified failure", [POWER_SUPPLY_HEALTH_COLD] = "Cold", [POWER_SUPPLY_HEALTH_WATCHDOG_TIMER_EXPIRE] = "Watchdog timer expire", [POWER_SUPPLY_HEALTH_SAFETY_TIMER_EXPIRE] = "Safety timer expire", [POWER_SUPPLY_HEALTH_OVERCURRENT] = "Over current", [POWER_SUPPLY_HEALTH_CALIBRATION_REQUIRED] = "Calibration required", [POWER_SUPPLY_HEALTH_WARM] = "Warm", [POWER_SUPPLY_HEALTH_COOL] = "Cool", [POWER_SUPPLY_HEALTH_HOT] = "Hot", [POWER_SUPPLY_HEALTH_NO_BATTERY] = "No battery", }; static const char * const POWER_SUPPLY_TECHNOLOGY_TEXT[] = { [POWER_SUPPLY_TECHNOLOGY_UNKNOWN] = "Unknown", [POWER_SUPPLY_TECHNOLOGY_NiMH] = "NiMH", [POWER_SUPPLY_TECHNOLOGY_LION] = "Li-ion", [POWER_SUPPLY_TECHNOLOGY_LIPO] = "Li-poly", [POWER_SUPPLY_TECHNOLOGY_LiFe] = "LiFe", [POWER_SUPPLY_TECHNOLOGY_NiCd] = "NiCd", [POWER_SUPPLY_TECHNOLOGY_LiMn] = "LiMn", }; static const char * const POWER_SUPPLY_CAPACITY_LEVEL_TEXT[] = { [POWER_SUPPLY_CAPACITY_LEVEL_UNKNOWN] = "Unknown", [POWER_SUPPLY_CAPACITY_LEVEL_CRITICAL] = "Critical", [POWER_SUPPLY_CAPACITY_LEVEL_LOW] = "Low", [POWER_SUPPLY_CAPACITY_LEVEL_NORMAL] = "Normal", [POWER_SUPPLY_CAPACITY_LEVEL_HIGH] = "High", [POWER_SUPPLY_CAPACITY_LEVEL_FULL] = "Full", }; static const char * const POWER_SUPPLY_SCOPE_TEXT[] = { [POWER_SUPPLY_SCOPE_UNKNOWN] = "Unknown", [POWER_SUPPLY_SCOPE_SYSTEM] = "System", [POWER_SUPPLY_SCOPE_DEVICE] = "Device", }; static const char * const POWER_SUPPLY_CHARGE_BEHAVIOUR_TEXT[] = { [POWER_SUPPLY_CHARGE_BEHAVIOUR_AUTO] = "auto", [POWER_SUPPLY_CHARGE_BEHAVIOUR_INHIBIT_CHARGE] = "inhibit-charge", [POWER_SUPPLY_CHARGE_BEHAVIOUR_FORCE_DISCHARGE] = "force-discharge", }; static struct power_supply_attr power_supply_attrs[] = { /* Properties of type `int' */ POWER_SUPPLY_ENUM_ATTR(STATUS), POWER_SUPPLY_ENUM_ATTR(CHARGE_TYPE), POWER_SUPPLY_ENUM_ATTR(HEALTH), POWER_SUPPLY_ATTR(PRESENT), POWER_SUPPLY_ATTR(ONLINE), POWER_SUPPLY_ATTR(AUTHENTIC), POWER_SUPPLY_ENUM_ATTR(TECHNOLOGY), POWER_SUPPLY_ATTR(CYCLE_COUNT), POWER_SUPPLY_ATTR(VOLTAGE_MAX), POWER_SUPPLY_ATTR(VOLTAGE_MIN), POWER_SUPPLY_ATTR(VOLTAGE_MAX_DESIGN), POWER_SUPPLY_ATTR(VOLTAGE_MIN_DESIGN), POWER_SUPPLY_ATTR(VOLTAGE_NOW), POWER_SUPPLY_ATTR(VOLTAGE_AVG), POWER_SUPPLY_ATTR(VOLTAGE_OCV), POWER_SUPPLY_ATTR(VOLTAGE_BOOT), POWER_SUPPLY_ATTR(CURRENT_MAX), POWER_SUPPLY_ATTR(CURRENT_NOW), POWER_SUPPLY_ATTR(CURRENT_AVG), POWER_SUPPLY_ATTR(CURRENT_BOOT), POWER_SUPPLY_ATTR(POWER_NOW), POWER_SUPPLY_ATTR(POWER_AVG), POWER_SUPPLY_ATTR(CHARGE_FULL_DESIGN), POWER_SUPPLY_ATTR(CHARGE_EMPTY_DESIGN), POWER_SUPPLY_ATTR(CHARGE_FULL), POWER_SUPPLY_ATTR(CHARGE_EMPTY), POWER_SUPPLY_ATTR(CHARGE_NOW), POWER_SUPPLY_ATTR(CHARGE_AVG), POWER_SUPPLY_ATTR(CHARGE_COUNTER), POWER_SUPPLY_ATTR(CONSTANT_CHARGE_CURRENT), POWER_SUPPLY_ATTR(CONSTANT_CHARGE_CURRENT_MAX), POWER_SUPPLY_ATTR(CONSTANT_CHARGE_VOLTAGE), POWER_SUPPLY_ATTR(CONSTANT_CHARGE_VOLTAGE_MAX), POWER_SUPPLY_ATTR(CHARGE_CONTROL_LIMIT), POWER_SUPPLY_ATTR(CHARGE_CONTROL_LIMIT_MAX), POWER_SUPPLY_ATTR(CHARGE_CONTROL_START_THRESHOLD), POWER_SUPPLY_ATTR(CHARGE_CONTROL_END_THRESHOLD), POWER_SUPPLY_ENUM_ATTR(CHARGE_BEHAVIOUR), POWER_SUPPLY_ATTR(INPUT_CURRENT_LIMIT), POWER_SUPPLY_ATTR(INPUT_VOLTAGE_LIMIT), POWER_SUPPLY_ATTR(INPUT_POWER_LIMIT), POWER_SUPPLY_ATTR(ENERGY_FULL_DESIGN), POWER_SUPPLY_ATTR(ENERGY_EMPTY_DESIGN), POWER_SUPPLY_ATTR(ENERGY_FULL), POWER_SUPPLY_ATTR(ENERGY_EMPTY), POWER_SUPPLY_ATTR(ENERGY_NOW), POWER_SUPPLY_ATTR(ENERGY_AVG), POWER_SUPPLY_ATTR(CAPACITY), POWER_SUPPLY_ATTR(CAPACITY_ALERT_MIN), POWER_SUPPLY_ATTR(CAPACITY_ALERT_MAX), POWER_SUPPLY_ATTR(CAPACITY_ERROR_MARGIN), POWER_SUPPLY_ENUM_ATTR(CAPACITY_LEVEL), POWER_SUPPLY_ATTR(TEMP), POWER_SUPPLY_ATTR(TEMP_MAX), POWER_SUPPLY_ATTR(TEMP_MIN), POWER_SUPPLY_ATTR(TEMP_ALERT_MIN), POWER_SUPPLY_ATTR(TEMP_ALERT_MAX), POWER_SUPPLY_ATTR(TEMP_AMBIENT), POWER_SUPPLY_ATTR(TEMP_AMBIENT_ALERT_MIN), POWER_SUPPLY_ATTR(TEMP_AMBIENT_ALERT_MAX), POWER_SUPPLY_ATTR(TIME_TO_EMPTY_NOW), POWER_SUPPLY_ATTR(TIME_TO_EMPTY_AVG), POWER_SUPPLY_ATTR(TIME_TO_FULL_NOW), POWER_SUPPLY_ATTR(TIME_TO_FULL_AVG), POWER_SUPPLY_ENUM_ATTR(TYPE), POWER_SUPPLY_ATTR(USB_TYPE), POWER_SUPPLY_ENUM_ATTR(SCOPE), POWER_SUPPLY_ATTR(PRECHARGE_CURRENT), POWER_SUPPLY_ATTR(CHARGE_TERM_CURRENT), POWER_SUPPLY_ATTR(CALIBRATE), POWER_SUPPLY_ATTR(MANUFACTURE_YEAR), POWER_SUPPLY_ATTR(MANUFACTURE_MONTH), POWER_SUPPLY_ATTR(MANUFACTURE_DAY), /* Properties of type `const char *' */ POWER_SUPPLY_ATTR(MODEL_NAME), POWER_SUPPLY_ATTR(MANUFACTURER), POWER_SUPPLY_ATTR(SERIAL_NUMBER), }; static struct attribute * __power_supply_attrs[ARRAY_SIZE(power_supply_attrs) + 1]; static struct power_supply_attr *to_ps_attr(struct device_attribute *attr) { return container_of(attr, struct power_supply_attr, dev_attr); } static enum power_supply_property dev_attr_psp(struct device_attribute *attr) { return to_ps_attr(attr) - power_supply_attrs; } static ssize_t power_supply_show_usb_type(struct device *dev, const struct power_supply_desc *desc, union power_supply_propval *value, char *buf) { enum power_supply_usb_type usb_type; ssize_t count = 0; bool match = false; int i; for (i = 0; i < desc->num_usb_types; ++i) { usb_type = desc->usb_types[i]; if (value->intval == usb_type) { count += sprintf(buf + count, "[%s] ", POWER_SUPPLY_USB_TYPE_TEXT[usb_type]); match = true; } else { count += sprintf(buf + count, "%s ", POWER_SUPPLY_USB_TYPE_TEXT[usb_type]); } } if (!match) { dev_warn(dev, "driver reporting unsupported connected type\n"); return -EINVAL; } if (count) buf[count - 1] = '\n'; return count; } static ssize_t power_supply_show_property(struct device *dev, struct device_attribute *attr, char *buf) { ssize_t ret; struct power_supply *psy = dev_get_drvdata(dev); struct power_supply_attr *ps_attr = to_ps_attr(attr); enum power_supply_property psp = dev_attr_psp(attr); union power_supply_propval value; if (psp == POWER_SUPPLY_PROP_TYPE) { value.intval = psy->desc->type; } else { ret = power_supply_get_property(psy, psp, &value); if (ret < 0) { if (ret == -ENODATA) dev_dbg_ratelimited(dev, "driver has no data for `%s' property\n", attr->attr.name); else if (ret != -ENODEV && ret != -EAGAIN) dev_err_ratelimited(dev, "driver failed to report `%s' property: %zd\n", attr->attr.name, ret); return ret; } } if (ps_attr->text_values_len > 0 && value.intval < ps_attr->text_values_len && value.intval >= 0) { return sprintf(buf, "%s\n", ps_attr->text_values[value.intval]); } switch (psp) { case POWER_SUPPLY_PROP_USB_TYPE: ret = power_supply_show_usb_type(dev, psy->desc, &value, buf); break; case POWER_SUPPLY_PROP_MODEL_NAME ... POWER_SUPPLY_PROP_SERIAL_NUMBER: ret = sprintf(buf, "%s\n", value.strval); break; default: ret = sprintf(buf, "%d\n", value.intval); } return ret; } static ssize_t power_supply_store_property(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { ssize_t ret; struct power_supply *psy = dev_get_drvdata(dev); struct power_supply_attr *ps_attr = to_ps_attr(attr); enum power_supply_property psp = dev_attr_psp(attr); union power_supply_propval value; ret = -EINVAL; if (ps_attr->text_values_len > 0) { ret = __sysfs_match_string(ps_attr->text_values, ps_attr->text_values_len, buf); } /* * If no match was found, then check to see if it is an integer. * Integer values are valid for enums in addition to the text value. */ if (ret < 0) { long long_val; ret = kstrtol(buf, 10, &long_val); if (ret < 0) return ret; ret = long_val; } value.intval = ret; ret = power_supply_set_property(psy, psp, &value); if (ret < 0) return ret; return count; } static umode_t power_supply_attr_is_visible(struct kobject *kobj, struct attribute *attr, int attrno) { struct device *dev = kobj_to_dev(kobj); struct power_supply *psy = dev_get_drvdata(dev); umode_t mode = S_IRUSR | S_IRGRP | S_IROTH; int i; if (!power_supply_attrs[attrno].prop_name) return 0; if (attrno == POWER_SUPPLY_PROP_TYPE) return mode; for (i = 0; i < psy->desc->num_properties; i++) { int property = psy->desc->properties[i]; if (property == attrno) { if (psy->desc->property_is_writeable && psy->desc->property_is_writeable(psy, property) > 0) mode |= S_IWUSR; return mode; } } return 0; } static const struct attribute_group power_supply_attr_group = { .attrs = __power_supply_attrs, .is_visible = power_supply_attr_is_visible, }; static const struct attribute_group *power_supply_attr_groups[] = { &power_supply_attr_group, NULL, }; static void str_to_lower(char *str) { while (*str) { *str = tolower(*str); str++; } } void power_supply_init_attrs(struct device_type *dev_type) { int i; dev_type->groups = power_supply_attr_groups; for (i = 0; i < ARRAY_SIZE(power_supply_attrs); i++) { struct device_attribute *attr; if (!power_supply_attrs[i].prop_name) { pr_warn("%s: Property %d skipped because it is missing from power_supply_attrs\n", __func__, i); sprintf(power_supply_attrs[i].attr_name, "_err_%d", i); } else { str_to_lower(power_supply_attrs[i].attr_name); } attr = &power_supply_attrs[i].dev_attr; attr->attr.name = power_supply_attrs[i].attr_name; attr->show = power_supply_show_property; attr->store = power_supply_store_property; __power_supply_attrs[i] = &attr->attr; } } static int add_prop_uevent(struct device *dev, struct kobj_uevent_env *env, enum power_supply_property prop, char *prop_buf) { int ret = 0; struct power_supply_attr *pwr_attr; struct device_attribute *dev_attr; char *line; pwr_attr = &power_supply_attrs[prop]; dev_attr = &pwr_attr->dev_attr; ret = power_supply_show_property(dev, dev_attr, prop_buf); if (ret == -ENODEV || ret == -ENODATA) { /* * When a battery is absent, we expect -ENODEV. Don't abort; * send the uevent with at least the PRESENT=0 property */ return 0; } if (ret < 0) return ret; line = strchr(prop_buf, '\n'); if (line) *line = 0; return add_uevent_var(env, "POWER_SUPPLY_%s=%s", pwr_attr->prop_name, prop_buf); } int power_supply_uevent(struct device *dev, struct kobj_uevent_env *env) { struct power_supply *psy = dev_get_drvdata(dev); int ret = 0, j; char *prop_buf; if (!psy || !psy->desc) { dev_dbg(dev, "No power supply yet\n"); return ret; } ret = add_uevent_var(env, "POWER_SUPPLY_NAME=%s", psy->desc->name); if (ret) return ret; prop_buf = (char *)get_zeroed_page(GFP_KERNEL); if (!prop_buf) return -ENOMEM; ret = add_prop_uevent(dev, env, POWER_SUPPLY_PROP_TYPE, prop_buf); if (ret) goto out; for (j = 0; j < psy->desc->num_properties; j++) { ret = add_prop_uevent(dev, env, psy->desc->properties[j], prop_buf); if (ret) goto out; } out: free_page((unsigned long)prop_buf); return ret; } ssize_t power_supply_charge_behaviour_show(struct device *dev, unsigned int available_behaviours, enum power_supply_charge_behaviour current_behaviour, char *buf) { bool match = false, available, active; ssize_t count = 0; int i; for (i = 0; i < ARRAY_SIZE(POWER_SUPPLY_CHARGE_BEHAVIOUR_TEXT); i++) { available = available_behaviours & BIT(i); active = i == current_behaviour; if (available && active) { count += sysfs_emit_at(buf, count, "[%s] ", POWER_SUPPLY_CHARGE_BEHAVIOUR_TEXT[i]); match = true; } else if (available) { count += sysfs_emit_at(buf, count, "%s ", POWER_SUPPLY_CHARGE_BEHAVIOUR_TEXT[i]); } } if (!match) { dev_warn(dev, "driver reporting unsupported charge behaviour\n"); return -EINVAL; } if (count) buf[count - 1] = '\n'; return count; } EXPORT_SYMBOL_GPL(power_supply_charge_behaviour_show); int power_supply_charge_behaviour_parse(unsigned int available_behaviours, const char *buf) { int i = sysfs_match_string(POWER_SUPPLY_CHARGE_BEHAVIOUR_TEXT, buf); if (i < 0) return i; if (available_behaviours & BIT(i)) return i; return -EINVAL; } EXPORT_SYMBOL_GPL(power_supply_charge_behaviour_parse); |
203 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_TTY_PORT_H #define _LINUX_TTY_PORT_H #include <linux/kfifo.h> #include <linux/kref.h> #include <linux/mutex.h> #include <linux/tty_buffer.h> #include <linux/wait.h> #include <linux/android_kabi.h> struct attribute_group; struct tty_driver; struct tty_port; struct tty_struct; /** * struct tty_port_operations -- operations on tty_port * @carrier_raised: return 1 if the carrier is raised on @port * @dtr_rts: raise the DTR line if @raise is nonzero, otherwise lower DTR * @shutdown: called when the last close completes or a hangup finishes IFF the * port was initialized. Do not use to free resources. Turn off the device * only. Called under the port mutex to serialize against @activate and * @shutdown. * @activate: called under the port mutex from tty_port_open(), serialized using * the port mutex. Supposed to turn on the device. * * FIXME: long term getting the tty argument *out* of this would be good * for consoles. * * @destruct: called on the final put of a port. Free resources, possibly incl. * the port itself. */ struct tty_port_operations { int (*carrier_raised)(struct tty_port *port); void (*dtr_rts)(struct tty_port *port, int raise); void (*shutdown)(struct tty_port *port); int (*activate)(struct tty_port *port, struct tty_struct *tty); void (*destruct)(struct tty_port *port); ANDROID_KABI_RESERVE(1); }; struct tty_port_client_operations { int (*receive_buf)(struct tty_port *port, const unsigned char *, const unsigned char *, size_t); void (*lookahead_buf)(struct tty_port *port, const unsigned char *cp, const unsigned char *fp, unsigned int count); void (*write_wakeup)(struct tty_port *port); }; extern const struct tty_port_client_operations tty_port_default_client_ops; /** * struct tty_port -- port level information * * @buf: buffer for this port, locked internally * @tty: back pointer to &struct tty_struct, valid only if the tty is open. Use * tty_port_tty_get() to obtain it (and tty_kref_put() to release). * @itty: internal back pointer to &struct tty_struct. Avoid this. It should be * eliminated in the long term. * @ops: tty port operations (like activate, shutdown), see &struct * tty_port_operations * @client_ops: tty port client operations (like receive_buf, write_wakeup). * By default, tty_port_default_client_ops is used. * @lock: lock protecting @tty * @blocked_open: # of procs waiting for open in tty_port_block_til_ready() * @count: usage count * @open_wait: open waiters queue (waiting e.g. for a carrier) * @delta_msr_wait: modem status change queue (waiting for MSR changes) * @flags: user TTY flags (%ASYNC_) * @iflags: internal flags (%TTY_PORT_) * @console: when set, the port is a console * @mutex: locking, for open, shutdown and other port operations * @buf_mutex: @xmit_buf alloc lock * @xmit_buf: optional xmit buffer used by some drivers * @xmit_fifo: optional xmit buffer used by some drivers * @close_delay: delay in jiffies to wait when closing the port * @closing_wait: delay in jiffies for output to be sent before closing * @drain_delay: set to zero if no pure time based drain is needed else set to * size of fifo * @kref: references counter. Reaching zero calls @ops->destruct() if non-%NULL * or frees the port otherwise. * @client_data: pointer to private data, for @client_ops * * Each device keeps its own port level information. &struct tty_port was * introduced as a common structure for such information. As every TTY device * shall have a backing tty_port structure, every driver can use these members. * * The tty port has a different lifetime to the tty so must be kept apart. * In addition be careful as tty -> port mappings are valid for the life * of the tty object but in many cases port -> tty mappings are valid only * until a hangup so don't use the wrong path. * * Tty port shall be initialized by tty_port_init() and shut down either by * tty_port_destroy() (refcounting not used), or tty_port_put() (refcounting). * * There is a lot of helpers around &struct tty_port too. To name the most * significant ones: tty_port_open(), tty_port_close() (or * tty_port_close_start() and tty_port_close_end() separately if need be), and * tty_port_hangup(). These call @ops->activate() and @ops->shutdown() as * needed. */ struct tty_port { struct tty_bufhead buf; struct tty_struct *tty; struct tty_struct *itty; const struct tty_port_operations *ops; const struct tty_port_client_operations *client_ops; spinlock_t lock; int blocked_open; int count; wait_queue_head_t open_wait; wait_queue_head_t delta_msr_wait; unsigned long flags; unsigned long iflags; unsigned char console:1; struct mutex mutex; struct mutex buf_mutex; unsigned char *xmit_buf; DECLARE_KFIFO_PTR(xmit_fifo, unsigned char); unsigned int close_delay; unsigned int closing_wait; int drain_delay; struct kref kref; void *client_data; ANDROID_KABI_RESERVE(1); }; /* tty_port::iflags bits -- use atomic bit ops */ #define TTY_PORT_INITIALIZED 0 /* device is initialized */ #define TTY_PORT_SUSPENDED 1 /* device is suspended */ #define TTY_PORT_ACTIVE 2 /* device is open */ /* * uart drivers: use the uart_port::status field and the UPSTAT_* defines * for s/w-based flow control steering and carrier detection status */ #define TTY_PORT_CTS_FLOW 3 /* h/w flow control enabled */ #define TTY_PORT_CHECK_CD 4 /* carrier detect enabled */ #define TTY_PORT_KOPENED 5 /* device exclusively opened by kernel */ void tty_port_init(struct tty_port *port); void tty_port_link_device(struct tty_port *port, struct tty_driver *driver, unsigned index); struct device *tty_port_register_device(struct tty_port *port, struct tty_driver *driver, unsigned index, struct device *device); struct device *tty_port_register_device_attr(struct tty_port *port, struct tty_driver *driver, unsigned index, struct device *device, void *drvdata, const struct attribute_group **attr_grp); struct device *tty_port_register_device_serdev(struct tty_port *port, struct tty_driver *driver, unsigned index, struct device *device); struct device *tty_port_register_device_attr_serdev(struct tty_port *port, struct tty_driver *driver, unsigned index, struct device *device, void *drvdata, const struct attribute_group **attr_grp); void tty_port_unregister_device(struct tty_port *port, struct tty_driver *driver, unsigned index); int tty_port_alloc_xmit_buf(struct tty_port *port); void tty_port_free_xmit_buf(struct tty_port *port); void tty_port_destroy(struct tty_port *port); void tty_port_put(struct tty_port *port); static inline struct tty_port *tty_port_get(struct tty_port *port) { if (port && kref_get_unless_zero(&port->kref)) return port; return NULL; } /* If the cts flow control is enabled, return true. */ static inline bool tty_port_cts_enabled(const struct tty_port *port) { return test_bit(TTY_PORT_CTS_FLOW, &port->iflags); } static inline void tty_port_set_cts_flow(struct tty_port *port, bool val) { assign_bit(TTY_PORT_CTS_FLOW, &port->iflags, val); } static inline bool tty_port_active(const struct tty_port *port) { return test_bit(TTY_PORT_ACTIVE, &port->iflags); } static inline void tty_port_set_active(struct tty_port *port, bool val) { assign_bit(TTY_PORT_ACTIVE, &port->iflags, val); } static inline bool tty_port_check_carrier(const struct tty_port *port) { return test_bit(TTY_PORT_CHECK_CD, &port->iflags); } static inline void tty_port_set_check_carrier(struct tty_port *port, bool val) { assign_bit(TTY_PORT_CHECK_CD, &port->iflags, val); } static inline bool tty_port_suspended(const struct tty_port *port) { return test_bit(TTY_PORT_SUSPENDED, &port->iflags); } static inline void tty_port_set_suspended(struct tty_port *port, bool val) { assign_bit(TTY_PORT_SUSPENDED, &port->iflags, val); } static inline bool tty_port_initialized(const struct tty_port *port) { return test_bit(TTY_PORT_INITIALIZED, &port->iflags); } static inline void tty_port_set_initialized(struct tty_port *port, bool val) { assign_bit(TTY_PORT_INITIALIZED, &port->iflags, val); } static inline bool tty_port_kopened(const struct tty_port *port) { return test_bit(TTY_PORT_KOPENED, &port->iflags); } static inline void tty_port_set_kopened(struct tty_port *port, bool val) { assign_bit(TTY_PORT_KOPENED, &port->iflags, val); } struct tty_struct *tty_port_tty_get(struct tty_port *port); void tty_port_tty_set(struct tty_port *port, struct tty_struct *tty); int tty_port_carrier_raised(struct tty_port *port); void tty_port_raise_dtr_rts(struct tty_port *port); void tty_port_lower_dtr_rts(struct tty_port *port); void tty_port_hangup(struct tty_port *port); void tty_port_tty_hangup(struct tty_port *port, bool check_clocal); void tty_port_tty_wakeup(struct tty_port *port); int tty_port_block_til_ready(struct tty_port *port, struct tty_struct *tty, struct file *filp); int tty_port_close_start(struct tty_port *port, struct tty_struct *tty, struct file *filp); void tty_port_close_end(struct tty_port *port, struct tty_struct *tty); void tty_port_close(struct tty_port *port, struct tty_struct *tty, struct file *filp); int tty_port_install(struct tty_port *port, struct tty_driver *driver, struct tty_struct *tty); int tty_port_open(struct tty_port *port, struct tty_struct *tty, struct file *filp); static inline int tty_port_users(struct tty_port *port) { return port->count + port->blocked_open; } #endif |
27 1 5 2 19 17 2 4 4 4 17 16 10 1 4 1 2 2 1 1 1 10 1 1 1 425 425 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 | // SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/act_gact.c Generic actions * * copyright Jamal Hadi Salim (2002-4) */ #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/module.h> #include <linux/init.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> #include <linux/tc_act/tc_gact.h> #include <net/tc_act/tc_gact.h> static struct tc_action_ops act_gact_ops; #ifdef CONFIG_GACT_PROB static int gact_net_rand(struct tcf_gact *gact) { smp_rmb(); /* coupled with smp_wmb() in tcf_gact_init() */ if (prandom_u32_max(gact->tcfg_pval)) return gact->tcf_action; return gact->tcfg_paction; } static int gact_determ(struct tcf_gact *gact) { u32 pack = atomic_inc_return(&gact->packets); smp_rmb(); /* coupled with smp_wmb() in tcf_gact_init() */ if (pack % gact->tcfg_pval) return gact->tcf_action; return gact->tcfg_paction; } typedef int (*g_rand)(struct tcf_gact *gact); static g_rand gact_rand[MAX_RAND] = { NULL, gact_net_rand, gact_determ }; #endif /* CONFIG_GACT_PROB */ static const struct nla_policy gact_policy[TCA_GACT_MAX + 1] = { [TCA_GACT_PARMS] = { .len = sizeof(struct tc_gact) }, [TCA_GACT_PROB] = { .len = sizeof(struct tc_gact_p) }, }; static int tcf_gact_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, act_gact_ops.net_id); bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_GACT_MAX + 1]; struct tcf_chain *goto_ch = NULL; struct tc_gact *parm; struct tcf_gact *gact; int ret = 0; u32 index; int err; #ifdef CONFIG_GACT_PROB struct tc_gact_p *p_parm = NULL; #endif if (nla == NULL) return -EINVAL; err = nla_parse_nested_deprecated(tb, TCA_GACT_MAX, nla, gact_policy, NULL); if (err < 0) return err; if (tb[TCA_GACT_PARMS] == NULL) return -EINVAL; parm = nla_data(tb[TCA_GACT_PARMS]); index = parm->index; #ifndef CONFIG_GACT_PROB if (tb[TCA_GACT_PROB] != NULL) return -EOPNOTSUPP; #else if (tb[TCA_GACT_PROB]) { p_parm = nla_data(tb[TCA_GACT_PROB]); if (p_parm->ptype >= MAX_RAND) return -EINVAL; if (TC_ACT_EXT_CMP(p_parm->paction, TC_ACT_GOTO_CHAIN)) { NL_SET_ERR_MSG(extack, "goto chain not allowed on fallback"); return -EINVAL; } } #endif err = tcf_idr_check_alloc(tn, &index, a, bind); if (!err) { ret = tcf_idr_create_from_flags(tn, index, est, a, &act_gact_ops, bind, flags); if (ret) { tcf_idr_cleanup(tn, index); return ret; } ret = ACT_P_CREATED; } else if (err > 0) { if (bind)/* dont override defaults */ return 0; if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } } else { return err; } err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); if (err < 0) goto release_idr; gact = to_gact(*a); spin_lock_bh(&gact->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); #ifdef CONFIG_GACT_PROB if (p_parm) { gact->tcfg_paction = p_parm->paction; gact->tcfg_pval = max_t(u16, 1, p_parm->pval); /* Make sure tcfg_pval is written before tcfg_ptype * coupled with smp_rmb() in gact_net_rand() & gact_determ() */ smp_wmb(); gact->tcfg_ptype = p_parm->ptype; } #endif spin_unlock_bh(&gact->tcf_lock); if (goto_ch) tcf_chain_put_by_act(goto_ch); return ret; release_idr: tcf_idr_release(*a, bind); return err; } static int tcf_gact_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_gact *gact = to_gact(a); int action = READ_ONCE(gact->tcf_action); #ifdef CONFIG_GACT_PROB { u32 ptype = READ_ONCE(gact->tcfg_ptype); if (ptype) action = gact_rand[ptype](gact); } #endif tcf_action_update_bstats(&gact->common, skb); if (action == TC_ACT_SHOT) tcf_action_inc_drop_qstats(&gact->common); tcf_lastuse_update(&gact->tcf_tm); return action; } static void tcf_gact_stats_update(struct tc_action *a, u64 bytes, u64 packets, u64 drops, u64 lastuse, bool hw) { struct tcf_gact *gact = to_gact(a); int action = READ_ONCE(gact->tcf_action); struct tcf_t *tm = &gact->tcf_tm; tcf_action_update_stats(a, bytes, packets, action == TC_ACT_SHOT ? packets : drops, hw); tm->lastuse = max_t(u64, tm->lastuse, lastuse); } static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); struct tcf_gact *gact = to_gact(a); struct tc_gact opt = { .index = gact->tcf_index, .refcnt = refcount_read(&gact->tcf_refcnt) - ref, .bindcnt = atomic_read(&gact->tcf_bindcnt) - bind, }; struct tcf_t t; spin_lock_bh(&gact->tcf_lock); opt.action = gact->tcf_action; if (nla_put(skb, TCA_GACT_PARMS, sizeof(opt), &opt)) goto nla_put_failure; #ifdef CONFIG_GACT_PROB if (gact->tcfg_ptype) { struct tc_gact_p p_opt = { .paction = gact->tcfg_paction, .pval = gact->tcfg_pval, .ptype = gact->tcfg_ptype, }; if (nla_put(skb, TCA_GACT_PROB, sizeof(p_opt), &p_opt)) goto nla_put_failure; } #endif tcf_tm_dump(&t, &gact->tcf_tm); if (nla_put_64bit(skb, TCA_GACT_TM, sizeof(t), &t, TCA_GACT_PAD)) goto nla_put_failure; spin_unlock_bh(&gact->tcf_lock); return skb->len; nla_put_failure: spin_unlock_bh(&gact->tcf_lock); nlmsg_trim(skb, b); return -1; } static size_t tcf_gact_get_fill_size(const struct tc_action *act) { size_t sz = nla_total_size(sizeof(struct tc_gact)); /* TCA_GACT_PARMS */ #ifdef CONFIG_GACT_PROB if (to_gact(act)->tcfg_ptype) /* TCA_GACT_PROB */ sz += nla_total_size(sizeof(struct tc_gact_p)); #endif return sz; } static int tcf_gact_offload_act_setup(struct tc_action *act, void *entry_data, u32 *index_inc, bool bind, struct netlink_ext_ack *extack) { if (bind) { struct flow_action_entry *entry = entry_data; if (is_tcf_gact_ok(act)) { entry->id = FLOW_ACTION_ACCEPT; } else if (is_tcf_gact_shot(act)) { entry->id = FLOW_ACTION_DROP; } else if (is_tcf_gact_trap(act)) { entry->id = FLOW_ACTION_TRAP; } else if (is_tcf_gact_goto_chain(act)) { entry->id = FLOW_ACTION_GOTO; entry->chain_index = tcf_gact_goto_chain_index(act); } else if (is_tcf_gact_continue(act)) { NL_SET_ERR_MSG_MOD(extack, "Offload of \"continue\" action is not supported"); return -EOPNOTSUPP; } else if (is_tcf_gact_reclassify(act)) { NL_SET_ERR_MSG_MOD(extack, "Offload of \"reclassify\" action is not supported"); return -EOPNOTSUPP; } else if (is_tcf_gact_pipe(act)) { NL_SET_ERR_MSG_MOD(extack, "Offload of \"pipe\" action is not supported"); return -EOPNOTSUPP; } else { NL_SET_ERR_MSG_MOD(extack, "Unsupported generic action offload"); return -EOPNOTSUPP; } *index_inc = 1; } else { struct flow_offload_action *fl_action = entry_data; if (is_tcf_gact_ok(act)) fl_action->id = FLOW_ACTION_ACCEPT; else if (is_tcf_gact_shot(act)) fl_action->id = FLOW_ACTION_DROP; else if (is_tcf_gact_trap(act)) fl_action->id = FLOW_ACTION_TRAP; else if (is_tcf_gact_goto_chain(act)) fl_action->id = FLOW_ACTION_GOTO; else return -EOPNOTSUPP; } return 0; } static struct tc_action_ops act_gact_ops = { .kind = "gact", .id = TCA_ID_GACT, .owner = THIS_MODULE, .act = tcf_gact_act, .stats_update = tcf_gact_stats_update, .dump = tcf_gact_dump, .init = tcf_gact_init, .get_fill_size = tcf_gact_get_fill_size, .offload_act_setup = tcf_gact_offload_act_setup, .size = sizeof(struct tcf_gact), }; static __net_init int gact_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, act_gact_ops.net_id); return tc_action_net_init(net, tn, &act_gact_ops); } static void __net_exit gact_exit_net(struct list_head *net_list) { tc_action_net_exit(net_list, act_gact_ops.net_id); } static struct pernet_operations gact_net_ops = { .init = gact_init_net, .exit_batch = gact_exit_net, .id = &act_gact_ops.net_id, .size = sizeof(struct tc_action_net), }; MODULE_AUTHOR("Jamal Hadi Salim(2002-4)"); MODULE_DESCRIPTION("Generic Classifier actions"); MODULE_LICENSE("GPL"); static int __init gact_init_module(void) { #ifdef CONFIG_GACT_PROB pr_info("GACT probability on\n"); #else pr_info("GACT probability NOT on\n"); #endif return tcf_register_action(&act_gact_ops, &gact_net_ops); } static void __exit gact_cleanup_module(void) { tcf_unregister_action(&act_gact_ops, &gact_net_ops); } module_init(gact_init_module); module_exit(gact_cleanup_module); |
15 145 7 104 109 94 93 79 108 205 44 44 109 109 7 28 43 18 2 40 18 39 1 3 34 20 1 20 4 26 48 20 28 10 3 9 75 31 44 3 1 71 17 17 42 42 43 43 43 36 70 70 12 58 58 42 42 42 42 4 42 41 4 40 4 40 17 17 17 17 186 161 11 162 186 186 45 45 151 151 78 79 15 14 62 66 66 66 66 62 5 62 62 62 5 19 17 1 1 17 17 17 17 1 17 17 17 17 14 2 17 1 43 44 40 6 7 40 146 146 146 111 101 101 90 12 102 38 3 35 35 34 35 35 1 40 9 3 35 7 35 15 4 2 10 6 14 3 1 3 8 5 48 48 3 46 12 8 41 19 34 20 20 19 5 47 20 20 20 20 19 17 7 7 18 19 2 2 3 3 11 3 7 1 2 1 1 1 1 3 3 37 37 35 35 35 34 27 6 37 37 6 115 109 37 425 425 3 3 9 9 9 10 1 9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 | // SPDX-License-Identifier: GPL-2.0-only /* Connection state tracking for netfilter. This is separated from, but required by, the NAT layer; it can also be used by an iptables extension. */ /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> * (C) 2005-2012 Patrick McHardy <kaber@trash.net> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/types.h> #include <linux/netfilter.h> #include <linux/module.h> #include <linux/sched.h> #include <linux/skbuff.h> #include <linux/proc_fs.h> #include <linux/vmalloc.h> #include <linux/stddef.h> #include <linux/slab.h> #include <linux/random.h> #include <linux/siphash.h> #include <linux/err.h> #include <linux/percpu.h> #include <linux/moduleparam.h> #include <linux/notifier.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/socket.h> #include <linux/mm.h> #include <linux/nsproxy.h> #include <linux/rculist_nulls.h> #include <trace/hooks/net.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_bpf.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_expect.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_extend.h> #include <net/netfilter/nf_conntrack_acct.h> #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/nf_conntrack_timestamp.h> #include <net/netfilter/nf_conntrack_timeout.h> #include <net/netfilter/nf_conntrack_labels.h> #include <net/netfilter/nf_conntrack_synproxy.h> #include <net/netfilter/nf_nat.h> #include <net/netfilter/nf_nat_helper.h> #include <net/netns/hash.h> #include <net/ip.h> #include "nf_internals.h" __cacheline_aligned_in_smp spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS]; EXPORT_SYMBOL_GPL(nf_conntrack_locks); __cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock); EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock); struct hlist_nulls_head *nf_conntrack_hash __read_mostly; EXPORT_SYMBOL_GPL(nf_conntrack_hash); struct conntrack_gc_work { struct delayed_work dwork; u32 next_bucket; u32 avg_timeout; u32 count; u32 start_time; bool exiting; bool early_drop; }; static __read_mostly struct kmem_cache *nf_conntrack_cachep; static DEFINE_SPINLOCK(nf_conntrack_locks_all_lock); static __read_mostly bool nf_conntrack_locks_all; /* serialize hash resizes and nf_ct_iterate_cleanup */ static DEFINE_MUTEX(nf_conntrack_mutex); #define GC_SCAN_INTERVAL_MAX (60ul * HZ) #define GC_SCAN_INTERVAL_MIN (1ul * HZ) /* clamp timeouts to this value (TCP unacked) */ #define GC_SCAN_INTERVAL_CLAMP (300ul * HZ) /* Initial bias pretending we have 100 entries at the upper bound so we don't * wakeup often just because we have three entries with a 1s timeout while still * allowing non-idle machines to wakeup more often when needed. */ #define GC_SCAN_INITIAL_COUNT 100 #define GC_SCAN_INTERVAL_INIT GC_SCAN_INTERVAL_MAX #define GC_SCAN_MAX_DURATION msecs_to_jiffies(10) #define GC_SCAN_EXPIRED_MAX (64000u / HZ) #define MIN_CHAINLEN 50u #define MAX_CHAINLEN (80u - MIN_CHAINLEN) static struct conntrack_gc_work conntrack_gc_work; void nf_conntrack_lock(spinlock_t *lock) __acquires(lock) { /* 1) Acquire the lock */ spin_lock(lock); /* 2) read nf_conntrack_locks_all, with ACQUIRE semantics * It pairs with the smp_store_release() in nf_conntrack_all_unlock() */ if (likely(smp_load_acquire(&nf_conntrack_locks_all) == false)) return; /* fast path failed, unlock */ spin_unlock(lock); /* Slow path 1) get global lock */ spin_lock(&nf_conntrack_locks_all_lock); /* Slow path 2) get the lock we want */ spin_lock(lock); /* Slow path 3) release the global lock */ spin_unlock(&nf_conntrack_locks_all_lock); } EXPORT_SYMBOL_GPL(nf_conntrack_lock); static void nf_conntrack_double_unlock(unsigned int h1, unsigned int h2) { h1 %= CONNTRACK_LOCKS; h2 %= CONNTRACK_LOCKS; spin_unlock(&nf_conntrack_locks[h1]); if (h1 != h2) spin_unlock(&nf_conntrack_locks[h2]); } /* return true if we need to recompute hashes (in case hash table was resized) */ static bool nf_conntrack_double_lock(struct net *net, unsigned int h1, unsigned int h2, unsigned int sequence) { h1 %= CONNTRACK_LOCKS; h2 %= CONNTRACK_LOCKS; if (h1 <= h2) { nf_conntrack_lock(&nf_conntrack_locks[h1]); if (h1 != h2) spin_lock_nested(&nf_conntrack_locks[h2], SINGLE_DEPTH_NESTING); } else { nf_conntrack_lock(&nf_conntrack_locks[h2]); spin_lock_nested(&nf_conntrack_locks[h1], SINGLE_DEPTH_NESTING); } if (read_seqcount_retry(&nf_conntrack_generation, sequence)) { nf_conntrack_double_unlock(h1, h2); return true; } return false; } static void nf_conntrack_all_lock(void) __acquires(&nf_conntrack_locks_all_lock) { int i; spin_lock(&nf_conntrack_locks_all_lock); /* For nf_contrack_locks_all, only the latest time when another * CPU will see an update is controlled, by the "release" of the * spin_lock below. * The earliest time is not controlled, an thus KCSAN could detect * a race when nf_conntract_lock() reads the variable. * WRITE_ONCE() is used to ensure the compiler will not * optimize the write. */ WRITE_ONCE(nf_conntrack_locks_all, true); for (i = 0; i < CONNTRACK_LOCKS; i++) { spin_lock(&nf_conntrack_locks[i]); /* This spin_unlock provides the "release" to ensure that * nf_conntrack_locks_all==true is visible to everyone that * acquired spin_lock(&nf_conntrack_locks[]). */ spin_unlock(&nf_conntrack_locks[i]); } } static void nf_conntrack_all_unlock(void) __releases(&nf_conntrack_locks_all_lock) { /* All prior stores must be complete before we clear * 'nf_conntrack_locks_all'. Otherwise nf_conntrack_lock() * might observe the false value but not the entire * critical section. * It pairs with the smp_load_acquire() in nf_conntrack_lock() */ smp_store_release(&nf_conntrack_locks_all, false); spin_unlock(&nf_conntrack_locks_all_lock); } unsigned int nf_conntrack_htable_size __read_mostly; EXPORT_SYMBOL_GPL(nf_conntrack_htable_size); unsigned int nf_conntrack_max __read_mostly; EXPORT_SYMBOL_GPL(nf_conntrack_max); seqcount_spinlock_t nf_conntrack_generation __read_mostly; static siphash_aligned_key_t nf_conntrack_hash_rnd; static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, unsigned int zoneid, const struct net *net) { struct { struct nf_conntrack_man src; union nf_inet_addr dst_addr; unsigned int zone; u32 net_mix; u16 dport; u16 proto; } __aligned(SIPHASH_ALIGNMENT) combined; get_random_once(&nf_conntrack_hash_rnd, sizeof(nf_conntrack_hash_rnd)); memset(&combined, 0, sizeof(combined)); /* The direction must be ignored, so handle usable members manually. */ combined.src = tuple->src; combined.dst_addr = tuple->dst.u3; combined.zone = zoneid; combined.net_mix = net_hash_mix(net); combined.dport = (__force __u16)tuple->dst.u.all; combined.proto = tuple->dst.protonum; return (u32)siphash(&combined, sizeof(combined), &nf_conntrack_hash_rnd); } static u32 scale_hash(u32 hash) { return reciprocal_scale(hash, nf_conntrack_htable_size); } static u32 __hash_conntrack(const struct net *net, const struct nf_conntrack_tuple *tuple, unsigned int zoneid, unsigned int size) { return reciprocal_scale(hash_conntrack_raw(tuple, zoneid, net), size); } static u32 hash_conntrack(const struct net *net, const struct nf_conntrack_tuple *tuple, unsigned int zoneid) { return scale_hash(hash_conntrack_raw(tuple, zoneid, net)); } static bool nf_ct_get_tuple_ports(const struct sk_buff *skb, unsigned int dataoff, struct nf_conntrack_tuple *tuple) { struct { __be16 sport; __be16 dport; } _inet_hdr, *inet_hdr; /* Actually only need first 4 bytes to get ports. */ inet_hdr = skb_header_pointer(skb, dataoff, sizeof(_inet_hdr), &_inet_hdr); if (!inet_hdr) return false; tuple->src.u.udp.port = inet_hdr->sport; tuple->dst.u.udp.port = inet_hdr->dport; return true; } static bool nf_ct_get_tuple(const struct sk_buff *skb, unsigned int nhoff, unsigned int dataoff, u_int16_t l3num, u_int8_t protonum, struct net *net, struct nf_conntrack_tuple *tuple) { unsigned int size; const __be32 *ap; __be32 _addrs[8]; memset(tuple, 0, sizeof(*tuple)); tuple->src.l3num = l3num; switch (l3num) { case NFPROTO_IPV4: nhoff += offsetof(struct iphdr, saddr); size = 2 * sizeof(__be32); break; case NFPROTO_IPV6: nhoff += offsetof(struct ipv6hdr, saddr); size = sizeof(_addrs); break; default: return true; } ap = skb_header_pointer(skb, nhoff, size, _addrs); if (!ap) return false; switch (l3num) { case NFPROTO_IPV4: tuple->src.u3.ip = ap[0]; tuple->dst.u3.ip = ap[1]; break; case NFPROTO_IPV6: memcpy(tuple->src.u3.ip6, ap, sizeof(tuple->src.u3.ip6)); memcpy(tuple->dst.u3.ip6, ap + 4, sizeof(tuple->dst.u3.ip6)); break; } tuple->dst.protonum = protonum; tuple->dst.dir = IP_CT_DIR_ORIGINAL; switch (protonum) { #if IS_ENABLED(CONFIG_IPV6) case IPPROTO_ICMPV6: return icmpv6_pkt_to_tuple(skb, dataoff, net, tuple); #endif case IPPROTO_ICMP: return icmp_pkt_to_tuple(skb, dataoff, net, tuple); #ifdef CONFIG_NF_CT_PROTO_GRE case IPPROTO_GRE: return gre_pkt_to_tuple(skb, dataoff, net, tuple); #endif case IPPROTO_TCP: case IPPROTO_UDP: #ifdef CONFIG_NF_CT_PROTO_UDPLITE case IPPROTO_UDPLITE: #endif #ifdef CONFIG_NF_CT_PROTO_SCTP case IPPROTO_SCTP: #endif #ifdef CONFIG_NF_CT_PROTO_DCCP case IPPROTO_DCCP: #endif /* fallthrough */ return nf_ct_get_tuple_ports(skb, dataoff, tuple); default: break; } return true; } static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, u_int8_t *protonum) { int dataoff = -1; const struct iphdr *iph; struct iphdr _iph; iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); if (!iph) return -1; /* Conntrack defragments packets, we might still see fragments * inside ICMP packets though. */ if (iph->frag_off & htons(IP_OFFSET)) return -1; dataoff = nhoff + (iph->ihl << 2); *protonum = iph->protocol; /* Check bogus IP headers */ if (dataoff > skb->len) { pr_debug("bogus IPv4 packet: nhoff %u, ihl %u, skblen %u\n", nhoff, iph->ihl << 2, skb->len); return -1; } return dataoff; } #if IS_ENABLED(CONFIG_IPV6) static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, u8 *protonum) { int protoff = -1; unsigned int extoff = nhoff + sizeof(struct ipv6hdr); __be16 frag_off; u8 nexthdr; if (skb_copy_bits(skb, nhoff + offsetof(struct ipv6hdr, nexthdr), &nexthdr, sizeof(nexthdr)) != 0) { pr_debug("can't get nexthdr\n"); return -1; } protoff = ipv6_skip_exthdr(skb, extoff, &nexthdr, &frag_off); /* * (protoff == skb->len) means the packet has not data, just * IPv6 and possibly extensions headers, but it is tracked anyway */ if (protoff < 0 || (frag_off & htons(~0x7)) != 0) { pr_debug("can't find proto in pkt\n"); return -1; } *protonum = nexthdr; return protoff; } #endif static int get_l4proto(const struct sk_buff *skb, unsigned int nhoff, u8 pf, u8 *l4num) { switch (pf) { case NFPROTO_IPV4: return ipv4_get_l4proto(skb, nhoff, l4num); #if IS_ENABLED(CONFIG_IPV6) case NFPROTO_IPV6: return ipv6_get_l4proto(skb, nhoff, l4num); #endif default: *l4num = 0; break; } return -1; } bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff, u_int16_t l3num, struct net *net, struct nf_conntrack_tuple *tuple) { u8 protonum; int protoff; protoff = get_l4proto(skb, nhoff, l3num, &protonum); if (protoff <= 0) return false; return nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, net, tuple); } EXPORT_SYMBOL_GPL(nf_ct_get_tuplepr); bool nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse, const struct nf_conntrack_tuple *orig) { memset(inverse, 0, sizeof(*inverse)); inverse->src.l3num = orig->src.l3num; switch (orig->src.l3num) { case NFPROTO_IPV4: inverse->src.u3.ip = orig->dst.u3.ip; inverse->dst.u3.ip = orig->src.u3.ip; break; case NFPROTO_IPV6: inverse->src.u3.in6 = orig->dst.u3.in6; inverse->dst.u3.in6 = orig->src.u3.in6; break; default: break; } inverse->dst.dir = !orig->dst.dir; inverse->dst.protonum = orig->dst.protonum; switch (orig->dst.protonum) { case IPPROTO_ICMP: return nf_conntrack_invert_icmp_tuple(inverse, orig); #if IS_ENABLED(CONFIG_IPV6) case IPPROTO_ICMPV6: return nf_conntrack_invert_icmpv6_tuple(inverse, orig); #endif } inverse->src.u.all = orig->dst.u.all; inverse->dst.u.all = orig->src.u.all; return true; } EXPORT_SYMBOL_GPL(nf_ct_invert_tuple); /* Generate a almost-unique pseudo-id for a given conntrack. * * intentionally doesn't re-use any of the seeds used for hash * table location, we assume id gets exposed to userspace. * * Following nf_conn items do not change throughout lifetime * of the nf_conn: * * 1. nf_conn address * 2. nf_conn->master address (normally NULL) * 3. the associated net namespace * 4. the original direction tuple */ u32 nf_ct_get_id(const struct nf_conn *ct) { static siphash_aligned_key_t ct_id_seed; unsigned long a, b, c, d; net_get_random_once(&ct_id_seed, sizeof(ct_id_seed)); a = (unsigned long)ct; b = (unsigned long)ct->master; c = (unsigned long)nf_ct_net(ct); d = (unsigned long)siphash(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, sizeof(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple), &ct_id_seed); #ifdef CONFIG_64BIT return siphash_4u64((u64)a, (u64)b, (u64)c, (u64)d, &ct_id_seed); #else return siphash_4u32((u32)a, (u32)b, (u32)c, (u32)d, &ct_id_seed); #endif } EXPORT_SYMBOL_GPL(nf_ct_get_id); static void clean_from_lists(struct nf_conn *ct) { pr_debug("clean_from_lists(%p)\n", ct); hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode); /* Destroy all pending expectations */ nf_ct_remove_expectations(ct); } #define NFCT_ALIGN(len) (((len) + NFCT_INFOMASK) & ~NFCT_INFOMASK) /* Released via nf_ct_destroy() */ struct nf_conn *nf_ct_tmpl_alloc(struct net *net, const struct nf_conntrack_zone *zone, gfp_t flags) { struct nf_conn *tmpl, *p; if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK) { tmpl = kzalloc(sizeof(*tmpl) + NFCT_INFOMASK, flags); if (!tmpl) return NULL; p = tmpl; tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p); if (tmpl != p) { tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p); tmpl->proto.tmpl_padto = (char *)tmpl - (char *)p; } } else { tmpl = kzalloc(sizeof(*tmpl), flags); if (!tmpl) return NULL; } tmpl->status = IPS_TEMPLATE; write_pnet(&tmpl->ct_net, net); nf_ct_zone_add(tmpl, zone); refcount_set(&tmpl->ct_general.use, 1); return tmpl; } EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc); void nf_ct_tmpl_free(struct nf_conn *tmpl) { kfree(tmpl->ext); if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK) kfree((char *)tmpl - tmpl->proto.tmpl_padto); else kfree(tmpl); } EXPORT_SYMBOL_GPL(nf_ct_tmpl_free); static void destroy_gre_conntrack(struct nf_conn *ct) { #ifdef CONFIG_NF_CT_PROTO_GRE struct nf_conn *master = ct->master; if (master) nf_ct_gre_keymap_destroy(master); #endif } void nf_ct_destroy(struct nf_conntrack *nfct) { struct nf_conn *ct = (struct nf_conn *)nfct; pr_debug("%s(%p)\n", __func__, ct); WARN_ON(refcount_read(&nfct->use) != 0); if (unlikely(nf_ct_is_template(ct))) { nf_ct_tmpl_free(ct); return; } if (unlikely(nf_ct_protonum(ct) == IPPROTO_GRE)) destroy_gre_conntrack(ct); /* Expectations will have been removed in clean_from_lists, * except TFTP can create an expectation on the first packet, * before connection is in the list, so we need to clean here, * too. */ nf_ct_remove_expectations(ct); if (ct->master) nf_ct_put(ct->master); pr_debug("%s: returning ct=%p to slab\n", __func__, ct); nf_conntrack_free(ct); } EXPORT_SYMBOL(nf_ct_destroy); static void __nf_ct_delete_from_lists(struct nf_conn *ct) { struct net *net = nf_ct_net(ct); unsigned int hash, reply_hash; unsigned int sequence; do { sequence = read_seqcount_begin(&nf_conntrack_generation); hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_ORIGINAL)); reply_hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY)); } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); clean_from_lists(ct); nf_conntrack_double_unlock(hash, reply_hash); } static void nf_ct_delete_from_lists(struct nf_conn *ct) { nf_ct_helper_destroy(ct); local_bh_disable(); __nf_ct_delete_from_lists(ct); local_bh_enable(); } static void nf_ct_add_to_ecache_list(struct nf_conn *ct) { #ifdef CONFIG_NF_CONNTRACK_EVENTS struct nf_conntrack_net *cnet = nf_ct_pernet(nf_ct_net(ct)); spin_lock(&cnet->ecache.dying_lock); hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, &cnet->ecache.dying_list); spin_unlock(&cnet->ecache.dying_lock); #endif } bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report) { struct nf_conn_tstamp *tstamp; struct net *net; if (test_and_set_bit(IPS_DYING_BIT, &ct->status)) return false; tstamp = nf_conn_tstamp_find(ct); if (tstamp) { s32 timeout = READ_ONCE(ct->timeout) - nfct_time_stamp; tstamp->stop = ktime_get_real_ns(); if (timeout < 0) tstamp->stop -= jiffies_to_nsecs(-timeout); } if (nf_conntrack_event_report(IPCT_DESTROY, ct, portid, report) < 0) { /* destroy event was not delivered. nf_ct_put will * be done by event cache worker on redelivery. */ nf_ct_helper_destroy(ct); local_bh_disable(); __nf_ct_delete_from_lists(ct); nf_ct_add_to_ecache_list(ct); local_bh_enable(); nf_conntrack_ecache_work(nf_ct_net(ct), NFCT_ECACHE_DESTROY_FAIL); return false; } net = nf_ct_net(ct); if (nf_conntrack_ecache_dwork_pending(net)) nf_conntrack_ecache_work(net, NFCT_ECACHE_DESTROY_SENT); nf_ct_delete_from_lists(ct); nf_ct_put(ct); return true; } EXPORT_SYMBOL_GPL(nf_ct_delete); static inline bool nf_ct_key_equal(struct nf_conntrack_tuple_hash *h, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone, const struct net *net) { struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); /* A conntrack can be recreated with the equal tuple, * so we need to check that the conntrack is confirmed */ return nf_ct_tuple_equal(tuple, &h->tuple) && nf_ct_zone_equal(ct, zone, NF_CT_DIRECTION(h)) && nf_ct_is_confirmed(ct) && net_eq(net, nf_ct_net(ct)); } static inline bool nf_ct_match(const struct nf_conn *ct1, const struct nf_conn *ct2) { return nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_ORIGINAL].tuple, &ct2->tuplehash[IP_CT_DIR_ORIGINAL].tuple) && nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_REPLY].tuple, &ct2->tuplehash[IP_CT_DIR_REPLY].tuple) && nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_ORIGINAL) && nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_REPLY) && net_eq(nf_ct_net(ct1), nf_ct_net(ct2)); } /* caller must hold rcu readlock and none of the nf_conntrack_locks */ static void nf_ct_gc_expired(struct nf_conn *ct) { if (!refcount_inc_not_zero(&ct->ct_general.use)) return; /* load ->status after refcount increase */ smp_acquire__after_ctrl_dep(); if (nf_ct_should_gc(ct)) nf_ct_kill(ct); nf_ct_put(ct); } /* * Warning : * - Caller must take a reference on returned object * and recheck nf_ct_tuple_equal(tuple, &h->tuple) */ static struct nf_conntrack_tuple_hash * ____nf_conntrack_find(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple, u32 hash) { struct nf_conntrack_tuple_hash *h; struct hlist_nulls_head *ct_hash; struct hlist_nulls_node *n; unsigned int bucket, hsize; begin: nf_conntrack_get_ht(&ct_hash, &hsize); bucket = reciprocal_scale(hash, hsize); hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[bucket], hnnode) { struct nf_conn *ct; ct = nf_ct_tuplehash_to_ctrack(h); if (nf_ct_is_expired(ct)) { nf_ct_gc_expired(ct); continue; } if (nf_ct_key_equal(h, tuple, zone, net)) return h; } /* * if the nulls value we got at the end of this lookup is * not the expected one, we must restart lookup. * We probably met an item that was moved to another chain. */ if (get_nulls_value(n) != bucket) { NF_CT_STAT_INC_ATOMIC(net, search_restart); goto begin; } return NULL; } /* Find a connection corresponding to a tuple. */ static struct nf_conntrack_tuple_hash * __nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple, u32 hash) { struct nf_conntrack_tuple_hash *h; struct nf_conn *ct; rcu_read_lock(); h = ____nf_conntrack_find(net, zone, tuple, hash); if (h) { /* We have a candidate that matches the tuple we're interested * in, try to obtain a reference and re-check tuple */ ct = nf_ct_tuplehash_to_ctrack(h); if (likely(refcount_inc_not_zero(&ct->ct_general.use))) { /* re-check key after refcount */ smp_acquire__after_ctrl_dep(); if (likely(nf_ct_key_equal(h, tuple, zone, net))) goto found; /* TYPESAFE_BY_RCU recycled the candidate */ nf_ct_put(ct); } h = NULL; } found: rcu_read_unlock(); return h; } struct nf_conntrack_tuple_hash * nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple) { unsigned int rid, zone_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL); struct nf_conntrack_tuple_hash *thash; thash = __nf_conntrack_find_get(net, zone, tuple, hash_conntrack_raw(tuple, zone_id, net)); if (thash) return thash; rid = nf_ct_zone_id(zone, IP_CT_DIR_REPLY); if (rid != zone_id) return __nf_conntrack_find_get(net, zone, tuple, hash_conntrack_raw(tuple, rid, net)); return thash; } EXPORT_SYMBOL_GPL(nf_conntrack_find_get); static void __nf_conntrack_hash_insert(struct nf_conn *ct, unsigned int hash, unsigned int reply_hash) { hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, &nf_conntrack_hash[hash]); hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode, &nf_conntrack_hash[reply_hash]); } static bool nf_ct_ext_valid_pre(const struct nf_ct_ext *ext) { /* if ext->gen_id is not equal to nf_conntrack_ext_genid, some extensions * may contain stale pointers to e.g. helper that has been removed. * * The helper can't clear this because the nf_conn object isn't in * any hash and synchronize_rcu() isn't enough because associated skb * might sit in a queue. */ return !ext || ext->gen_id == atomic_read(&nf_conntrack_ext_genid); } static bool nf_ct_ext_valid_post(struct nf_ct_ext *ext) { if (!ext) return true; if (ext->gen_id != atomic_read(&nf_conntrack_ext_genid)) return false; /* inserted into conntrack table, nf_ct_iterate_cleanup() * will find it. Disable nf_ct_ext_find() id check. */ WRITE_ONCE(ext->gen_id, 0); return true; } int nf_conntrack_hash_check_insert(struct nf_conn *ct) { const struct nf_conntrack_zone *zone; struct net *net = nf_ct_net(ct); unsigned int hash, reply_hash; struct nf_conntrack_tuple_hash *h; struct hlist_nulls_node *n; unsigned int max_chainlen; unsigned int chainlen = 0; unsigned int sequence; int err = -EEXIST; zone = nf_ct_zone(ct); if (!nf_ct_ext_valid_pre(ct->ext)) return -EAGAIN; local_bh_disable(); do { sequence = read_seqcount_begin(&nf_conntrack_generation); hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_ORIGINAL)); reply_hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY)); } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); max_chainlen = MIN_CHAINLEN + prandom_u32_max(MAX_CHAINLEN); /* See if there's one in the list already, including reverse */ hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) { if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, zone, net)) goto out; if (chainlen++ > max_chainlen) goto chaintoolong; } chainlen = 0; hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) { if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, zone, net)) goto out; if (chainlen++ > max_chainlen) goto chaintoolong; } /* If genid has changed, we can't insert anymore because ct * extensions could have stale pointers and nf_ct_iterate_destroy * might have completed its table scan already. * * Increment of the ext genid right after this check is fine: * nf_ct_iterate_destroy blocks until locks are released. */ if (!nf_ct_ext_valid_post(ct->ext)) { err = -EAGAIN; goto out; } smp_wmb(); /* The caller holds a reference to this object */ refcount_set(&ct->ct_general.use, 2); __nf_conntrack_hash_insert(ct, hash, reply_hash); nf_conntrack_double_unlock(hash, reply_hash); NF_CT_STAT_INC(net, insert); local_bh_enable(); return 0; chaintoolong: NF_CT_STAT_INC(net, chaintoolong); err = -ENOSPC; out: nf_conntrack_double_unlock(hash, reply_hash); local_bh_enable(); return err; } EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert); void nf_ct_acct_add(struct nf_conn *ct, u32 dir, unsigned int packets, unsigned int bytes) { struct nf_conn_acct *acct; acct = nf_conn_acct_find(ct); if (acct) { struct nf_conn_counter *counter = acct->counter; atomic64_add(packets, &counter[dir].packets); atomic64_add(bytes, &counter[dir].bytes); } } EXPORT_SYMBOL_GPL(nf_ct_acct_add); static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo, const struct nf_conn *loser_ct) { struct nf_conn_acct *acct; acct = nf_conn_acct_find(loser_ct); if (acct) { struct nf_conn_counter *counter = acct->counter; unsigned int bytes; /* u32 should be fine since we must have seen one packet. */ bytes = atomic64_read(&counter[CTINFO2DIR(ctinfo)].bytes); nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), bytes); } } static void __nf_conntrack_insert_prepare(struct nf_conn *ct) { struct nf_conn_tstamp *tstamp; refcount_inc(&ct->ct_general.use); /* set conntrack timestamp, if enabled. */ tstamp = nf_conn_tstamp_find(ct); if (tstamp) tstamp->start = ktime_get_real_ns(); } /* caller must hold locks to prevent concurrent changes */ static int __nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h) { /* This is the conntrack entry already in hashes that won race. */ struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); enum ip_conntrack_info ctinfo; struct nf_conn *loser_ct; loser_ct = nf_ct_get(skb, &ctinfo); if (nf_ct_is_dying(ct)) return NF_DROP; if (((ct->status & IPS_NAT_DONE_MASK) == 0) || nf_ct_match(ct, loser_ct)) { struct net *net = nf_ct_net(ct); nf_conntrack_get(&ct->ct_general); nf_ct_acct_merge(ct, ctinfo, loser_ct); nf_ct_put(loser_ct); nf_ct_set(skb, ct, ctinfo); NF_CT_STAT_INC(net, clash_resolve); return NF_ACCEPT; } return NF_DROP; } /** * nf_ct_resolve_clash_harder - attempt to insert clashing conntrack entry * * @skb: skb that causes the collision * @repl_idx: hash slot for reply direction * * Called when origin or reply direction had a clash. * The skb can be handled without packet drop provided the reply direction * is unique or there the existing entry has the identical tuple in both * directions. * * Caller must hold conntrack table locks to prevent concurrent updates. * * Returns NF_DROP if the clash could not be handled. */ static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx) { struct nf_conn *loser_ct = (struct nf_conn *)skb_nfct(skb); const struct nf_conntrack_zone *zone; struct nf_conntrack_tuple_hash *h; struct hlist_nulls_node *n; struct net *net; zone = nf_ct_zone(loser_ct); net = nf_ct_net(loser_ct); /* Reply direction must never result in a clash, unless both origin * and reply tuples are identical. */ hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[repl_idx], hnnode) { if (nf_ct_key_equal(h, &loser_ct->tuplehash[IP_CT_DIR_REPLY].tuple, zone, net)) return __nf_ct_resolve_clash(skb, h); } /* We want the clashing entry to go away real soon: 1 second timeout. */ WRITE_ONCE(loser_ct->timeout, nfct_time_stamp + HZ); /* IPS_NAT_CLASH removes the entry automatically on the first * reply. Also prevents UDP tracker from moving the entry to * ASSURED state, i.e. the entry can always be evicted under * pressure. */ loser_ct->status |= IPS_FIXED_TIMEOUT | IPS_NAT_CLASH; __nf_conntrack_insert_prepare(loser_ct); /* fake add for ORIGINAL dir: we want lookups to only find the entry * already in the table. This also hides the clashing entry from * ctnetlink iteration, i.e. conntrack -L won't show them. */ hlist_nulls_add_fake(&loser_ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); hlist_nulls_add_head_rcu(&loser_ct->tuplehash[IP_CT_DIR_REPLY].hnnode, &nf_conntrack_hash[repl_idx]); NF_CT_STAT_INC(net, clash_resolve); return NF_ACCEPT; } /** * nf_ct_resolve_clash - attempt to handle clash without packet drop * * @skb: skb that causes the clash * @h: tuplehash of the clashing entry already in table * @reply_hash: hash slot for reply direction * * A conntrack entry can be inserted to the connection tracking table * if there is no existing entry with an identical tuple. * * If there is one, @skb (and the assocated, unconfirmed conntrack) has * to be dropped. In case @skb is retransmitted, next conntrack lookup * will find the already-existing entry. * * The major problem with such packet drop is the extra delay added by * the packet loss -- it will take some time for a retransmit to occur * (or the sender to time out when waiting for a reply). * * This function attempts to handle the situation without packet drop. * * If @skb has no NAT transformation or if the colliding entries are * exactly the same, only the to-be-confirmed conntrack entry is discarded * and @skb is associated with the conntrack entry already in the table. * * Failing that, the new, unconfirmed conntrack is still added to the table * provided that the collision only occurs in the ORIGINAL direction. * The new entry will be added only in the non-clashing REPLY direction, * so packets in the ORIGINAL direction will continue to match the existing * entry. The new entry will also have a fixed timeout so it expires -- * due to the collision, it will only see reply traffic. * * Returns NF_DROP if the clash could not be resolved. */ static __cold noinline int nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h, u32 reply_hash) { /* This is the conntrack entry already in hashes that won race. */ struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); const struct nf_conntrack_l4proto *l4proto; enum ip_conntrack_info ctinfo; struct nf_conn *loser_ct; struct net *net; int ret; loser_ct = nf_ct_get(skb, &ctinfo); net = nf_ct_net(loser_ct); l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct)); if (!l4proto->allow_clash) goto drop; ret = __nf_ct_resolve_clash(skb, h); if (ret == NF_ACCEPT) return ret; ret = nf_ct_resolve_clash_harder(skb, reply_hash); if (ret == NF_ACCEPT) return ret; drop: NF_CT_STAT_INC(net, drop); NF_CT_STAT_INC(net, insert_failed); return NF_DROP; } /* Confirm a connection given skb; places it in hash table */ int __nf_conntrack_confirm(struct sk_buff *skb) { unsigned int chainlen = 0, sequence, max_chainlen; const struct nf_conntrack_zone *zone; unsigned int hash, reply_hash; struct nf_conntrack_tuple_hash *h; struct nf_conn *ct; struct nf_conn_help *help; struct hlist_nulls_node *n; enum ip_conntrack_info ctinfo; struct net *net; int ret = NF_DROP; ct = nf_ct_get(skb, &ctinfo); net = nf_ct_net(ct); /* ipt_REJECT uses nf_conntrack_attach to attach related ICMP/TCP RST packets in other direction. Actual packet which created connection will be IP_CT_NEW or for an expected connection, IP_CT_RELATED. */ if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) return NF_ACCEPT; zone = nf_ct_zone(ct); local_bh_disable(); do { sequence = read_seqcount_begin(&nf_conntrack_generation); /* reuse the hash saved before */ hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev; hash = scale_hash(hash); reply_hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY)); } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); /* We're not in hash table, and we refuse to set up related * connections for unconfirmed conns. But packet copies and * REJECT will give spurious warnings here. */ /* Another skb with the same unconfirmed conntrack may * win the race. This may happen for bridge(br_flood) * or broadcast/multicast packets do skb_clone with * unconfirmed conntrack. */ if (unlikely(nf_ct_is_confirmed(ct))) { WARN_ON_ONCE(1); nf_conntrack_double_unlock(hash, reply_hash); local_bh_enable(); return NF_DROP; } if (!nf_ct_ext_valid_pre(ct->ext)) { NF_CT_STAT_INC(net, insert_failed); goto dying; } pr_debug("Confirming conntrack %p\n", ct); /* We have to check the DYING flag after unlink to prevent * a race against nf_ct_get_next_corpse() possibly called from * user context, else we insert an already 'dead' hash, blocking * further use of that particular connection -JM. */ ct->status |= IPS_CONFIRMED; if (unlikely(nf_ct_is_dying(ct))) { NF_CT_STAT_INC(net, insert_failed); goto dying; } max_chainlen = MIN_CHAINLEN + prandom_u32_max(MAX_CHAINLEN); /* See if there's one in the list already, including reverse: NAT could have grabbed it without realizing, since we're not in the hash. If there is, we lost race. */ hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) { if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, zone, net)) goto out; if (chainlen++ > max_chainlen) goto chaintoolong; } chainlen = 0; hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) { if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, zone, net)) goto out; if (chainlen++ > max_chainlen) { chaintoolong: NF_CT_STAT_INC(net, chaintoolong); NF_CT_STAT_INC(net, insert_failed); ret = NF_DROP; goto dying; } } /* Timer relative to confirmation time, not original setting time, otherwise we'd get timer wrap in weird delay cases. */ ct->timeout += nfct_time_stamp; __nf_conntrack_insert_prepare(ct); /* Since the lookup is lockless, hash insertion must be done after * starting the timer and setting the CONFIRMED bit. The RCU barriers * guarantee that no other CPU can find the conntrack before the above * stores are visible. */ __nf_conntrack_hash_insert(ct, hash, reply_hash); nf_conntrack_double_unlock(hash, reply_hash); local_bh_enable(); /* ext area is still valid (rcu read lock is held, * but will go out of scope soon, we need to remove * this conntrack again. */ if (!nf_ct_ext_valid_post(ct->ext)) { nf_ct_kill(ct); NF_CT_STAT_INC_ATOMIC(net, drop); return NF_DROP; } help = nfct_help(ct); if (help && help->helper) nf_conntrack_event_cache(IPCT_HELPER, ct); nf_conntrack_event_cache(master_ct(ct) ? IPCT_RELATED : IPCT_NEW, ct); return NF_ACCEPT; out: ret = nf_ct_resolve_clash(skb, h, reply_hash); dying: nf_conntrack_double_unlock(hash, reply_hash); local_bh_enable(); return ret; } EXPORT_SYMBOL_GPL(__nf_conntrack_confirm); /* Returns true if a connection correspondings to the tuple (required for NAT). */ int nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, const struct nf_conn *ignored_conntrack) { struct net *net = nf_ct_net(ignored_conntrack); const struct nf_conntrack_zone *zone; struct nf_conntrack_tuple_hash *h; struct hlist_nulls_head *ct_hash; unsigned int hash, hsize; struct hlist_nulls_node *n; struct nf_conn *ct; zone = nf_ct_zone(ignored_conntrack); rcu_read_lock(); begin: nf_conntrack_get_ht(&ct_hash, &hsize); hash = __hash_conntrack(net, tuple, nf_ct_zone_id(zone, IP_CT_DIR_REPLY), hsize); hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) { ct = nf_ct_tuplehash_to_ctrack(h); if (ct == ignored_conntrack) continue; if (nf_ct_is_expired(ct)) { nf_ct_gc_expired(ct); continue; } if (nf_ct_key_equal(h, tuple, zone, net)) { /* Tuple is taken already, so caller will need to find * a new source port to use. * * Only exception: * If the *original tuples* are identical, then both * conntracks refer to the same flow. * This is a rare situation, it can occur e.g. when * more than one UDP packet is sent from same socket * in different threads. * * Let nf_ct_resolve_clash() deal with this later. */ if (nf_ct_tuple_equal(&ignored_conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple) && nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) continue; NF_CT_STAT_INC_ATOMIC(net, found); rcu_read_unlock(); return 1; } } if (get_nulls_value(n) != hash) { NF_CT_STAT_INC_ATOMIC(net, search_restart); goto begin; } rcu_read_unlock(); return 0; } EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken); #define NF_CT_EVICTION_RANGE 8 /* There's a small race here where we may free a just-assured connection. Too bad: we're in trouble anyway. */ static unsigned int early_drop_list(struct net *net, struct hlist_nulls_head *head) { struct nf_conntrack_tuple_hash *h; struct hlist_nulls_node *n; unsigned int drops = 0; struct nf_conn *tmp; hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) { tmp = nf_ct_tuplehash_to_ctrack(h); if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) continue; if (nf_ct_is_expired(tmp)) { nf_ct_gc_expired(tmp); continue; } if (test_bit(IPS_ASSURED_BIT, &tmp->status) || !net_eq(nf_ct_net(tmp), net) || nf_ct_is_dying(tmp)) continue; if (!refcount_inc_not_zero(&tmp->ct_general.use)) continue; /* load ->ct_net and ->status after refcount increase */ smp_acquire__after_ctrl_dep(); /* kill only if still in same netns -- might have moved due to * SLAB_TYPESAFE_BY_RCU rules. * * We steal the timer reference. If that fails timer has * already fired or someone else deleted it. Just drop ref * and move to next entry. */ if (net_eq(nf_ct_net(tmp), net) && nf_ct_is_confirmed(tmp) && nf_ct_delete(tmp, 0, 0)) drops++; nf_ct_put(tmp); } return drops; } static noinline int early_drop(struct net *net, unsigned int hash) { unsigned int i, bucket; for (i = 0; i < NF_CT_EVICTION_RANGE; i++) { struct hlist_nulls_head *ct_hash; unsigned int hsize, drops; rcu_read_lock(); nf_conntrack_get_ht(&ct_hash, &hsize); if (!i) bucket = reciprocal_scale(hash, hsize); else bucket = (bucket + 1) % hsize; drops = early_drop_list(net, &ct_hash[bucket]); rcu_read_unlock(); if (drops) { NF_CT_STAT_ADD_ATOMIC(net, early_drop, drops); return true; } } return false; } static bool gc_worker_skip_ct(const struct nf_conn *ct) { return !nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct); } static bool gc_worker_can_early_drop(const struct nf_conn *ct) { const struct nf_conntrack_l4proto *l4proto; if (!test_bit(IPS_ASSURED_BIT, &ct->status)) return true; l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct)); if (l4proto->can_early_drop && l4proto->can_early_drop(ct)) return true; return false; } static void gc_worker(struct work_struct *work) { unsigned int i, hashsz, nf_conntrack_max95 = 0; u32 end_time, start_time = nfct_time_stamp; struct conntrack_gc_work *gc_work; unsigned int expired_count = 0; unsigned long next_run; s32 delta_time; long count; gc_work = container_of(work, struct conntrack_gc_work, dwork.work); i = gc_work->next_bucket; if (gc_work->early_drop) nf_conntrack_max95 = nf_conntrack_max / 100u * 95u; if (i == 0) { gc_work->avg_timeout = GC_SCAN_INTERVAL_INIT; gc_work->count = GC_SCAN_INITIAL_COUNT; gc_work->start_time = start_time; } next_run = gc_work->avg_timeout; count = gc_work->count; end_time = start_time + GC_SCAN_MAX_DURATION; do { struct nf_conntrack_tuple_hash *h; struct hlist_nulls_head *ct_hash; struct hlist_nulls_node *n; struct nf_conn *tmp; rcu_read_lock(); nf_conntrack_get_ht(&ct_hash, &hashsz); if (i >= hashsz) { rcu_read_unlock(); break; } hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) { struct nf_conntrack_net *cnet; struct net *net; long expires; tmp = nf_ct_tuplehash_to_ctrack(h); if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) { nf_ct_offload_timeout(tmp); continue; } if (expired_count > GC_SCAN_EXPIRED_MAX) { rcu_read_unlock(); gc_work->next_bucket = i; gc_work->avg_timeout = next_run; gc_work->count = count; delta_time = nfct_time_stamp - gc_work->start_time; /* re-sched immediately if total cycle time is exceeded */ next_run = delta_time < (s32)GC_SCAN_INTERVAL_MAX; goto early_exit; } if (nf_ct_is_expired(tmp)) { nf_ct_gc_expired(tmp); expired_count++; continue; } expires = clamp(nf_ct_expires(tmp), GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_CLAMP); expires = (expires - (long)next_run) / ++count; next_run += expires; if (nf_conntrack_max95 == 0 || gc_worker_skip_ct(tmp)) continue; net = nf_ct_net(tmp); cnet = nf_ct_pernet(net); if (atomic_read(&cnet->count) < nf_conntrack_max95) continue; /* need to take reference to avoid possible races */ if (!refcount_inc_not_zero(&tmp->ct_general.use)) continue; /* load ->status after refcount increase */ smp_acquire__after_ctrl_dep(); if (gc_worker_skip_ct(tmp)) { nf_ct_put(tmp); continue; } if (gc_worker_can_early_drop(tmp)) { nf_ct_kill(tmp); expired_count++; } nf_ct_put(tmp); } /* could check get_nulls_value() here and restart if ct * was moved to another chain. But given gc is best-effort * we will just continue with next hash slot. */ rcu_read_unlock(); cond_resched(); i++; delta_time = nfct_time_stamp - end_time; if (delta_time > 0 && i < hashsz) { gc_work->avg_timeout = next_run; gc_work->count = count; gc_work->next_bucket = i; next_run = 0; goto early_exit; } } while (i < hashsz); gc_work->next_bucket = 0; next_run = clamp(next_run, GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_MAX); delta_time = max_t(s32, nfct_time_stamp - gc_work->start_time, 1); if (next_run > (unsigned long)delta_time) next_run -= delta_time; else next_run = 1; early_exit: if (gc_work->exiting) return; if (next_run) gc_work->early_drop = false; queue_delayed_work(system_power_efficient_wq, &gc_work->dwork, next_run); } static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work) { INIT_DELAYED_WORK(&gc_work->dwork, gc_worker); gc_work->exiting = false; } static struct nf_conn * __nf_conntrack_alloc(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *orig, const struct nf_conntrack_tuple *repl, gfp_t gfp, u32 hash) { struct nf_conntrack_net *cnet = nf_ct_pernet(net); unsigned int ct_count; struct nf_conn *ct; /* We don't want any race condition at early drop stage */ ct_count = atomic_inc_return(&cnet->count); if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) { if (!early_drop(net, hash)) { if (!conntrack_gc_work.early_drop) conntrack_gc_work.early_drop = true; atomic_dec(&cnet->count); net_warn_ratelimited("nf_conntrack: table full, dropping packet\n"); return ERR_PTR(-ENOMEM); } } /* * Do not use kmem_cache_zalloc(), as this cache uses * SLAB_TYPESAFE_BY_RCU. */ ct = kmem_cache_alloc(nf_conntrack_cachep, gfp); if (ct == NULL) goto out; spin_lock_init(&ct->lock); ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig; ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL; ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl; /* save hash for reusing when confirming */ *(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash; ct->status = 0; WRITE_ONCE(ct->timeout, 0); write_pnet(&ct->ct_net, net); memset_after(ct, 0, __nfct_init_offset); nf_ct_zone_add(ct, zone); trace_android_rvh_nf_conn_alloc(ct); /* Because we use RCU lookups, we set ct_general.use to zero before * this is inserted in any list. */ refcount_set(&ct->ct_general.use, 0); return ct; out: atomic_dec(&cnet->count); return ERR_PTR(-ENOMEM); } struct nf_conn *nf_conntrack_alloc(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *orig, const struct nf_conntrack_tuple *repl, gfp_t gfp) { return __nf_conntrack_alloc(net, zone, orig, repl, gfp, 0); } EXPORT_SYMBOL_GPL(nf_conntrack_alloc); void nf_conntrack_free(struct nf_conn *ct) { struct net *net = nf_ct_net(ct); struct nf_conntrack_net *cnet; /* A freed object has refcnt == 0, that's * the golden rule for SLAB_TYPESAFE_BY_RCU */ WARN_ON(refcount_read(&ct->ct_general.use) != 0); if (ct->status & IPS_SRC_NAT_DONE) { const struct nf_nat_hook *nat_hook; rcu_read_lock(); nat_hook = rcu_dereference(nf_nat_hook); if (nat_hook) nat_hook->remove_nat_bysrc(ct); rcu_read_unlock(); } kfree(ct->ext); kmem_cache_free(nf_conntrack_cachep, ct); cnet = nf_ct_pernet(net); smp_mb__before_atomic(); trace_android_rvh_nf_conn_free(ct); atomic_dec(&cnet->count); } EXPORT_SYMBOL_GPL(nf_conntrack_free); /* Allocate a new conntrack: we return -ENOMEM if classification failed due to stress. Otherwise it really is unclassifiable. */ static noinline struct nf_conntrack_tuple_hash * init_conntrack(struct net *net, struct nf_conn *tmpl, const struct nf_conntrack_tuple *tuple, struct sk_buff *skb, unsigned int dataoff, u32 hash) { struct nf_conn *ct; struct nf_conn_help *help; struct nf_conntrack_tuple repl_tuple; #ifdef CONFIG_NF_CONNTRACK_EVENTS struct nf_conntrack_ecache *ecache; #endif struct nf_conntrack_expect *exp = NULL; const struct nf_conntrack_zone *zone; struct nf_conn_timeout *timeout_ext; struct nf_conntrack_zone tmp; struct nf_conntrack_net *cnet; if (!nf_ct_invert_tuple(&repl_tuple, tuple)) { pr_debug("Can't invert tuple.\n"); return NULL; } zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC, hash); if (IS_ERR(ct)) return (struct nf_conntrack_tuple_hash *)ct; if (!nf_ct_add_synproxy(ct, tmpl)) { nf_conntrack_free(ct); return ERR_PTR(-ENOMEM); } timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL; if (timeout_ext) nf_ct_timeout_ext_add(ct, rcu_dereference(timeout_ext->timeout), GFP_ATOMIC); nf_ct_acct_ext_add(ct, GFP_ATOMIC); nf_ct_tstamp_ext_add(ct, GFP_ATOMIC); nf_ct_labels_ext_add(ct); #ifdef CONFIG_NF_CONNTRACK_EVENTS ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL; if ((ecache || net->ct.sysctl_events) && !nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0, ecache ? ecache->expmask : 0, GFP_ATOMIC)) { nf_conntrack_free(ct); return ERR_PTR(-ENOMEM); } #endif cnet = nf_ct_pernet(net); if (cnet->expect_count) { spin_lock_bh(&nf_conntrack_expect_lock); exp = nf_ct_find_expectation(net, zone, tuple); if (exp) { pr_debug("expectation arrives ct=%p exp=%p\n", ct, exp); /* Welcome, Mr. Bond. We've been expecting you... */ __set_bit(IPS_EXPECTED_BIT, &ct->status); /* exp->master safe, refcnt bumped in nf_ct_find_expectation */ ct->master = exp->master; if (exp->helper) { help = nf_ct_helper_ext_add(ct, GFP_ATOMIC); if (help) rcu_assign_pointer(help->helper, exp->helper); } #ifdef CONFIG_NF_CONNTRACK_MARK ct->mark = READ_ONCE(exp->master->mark); #endif #ifdef CONFIG_NF_CONNTRACK_SECMARK ct->secmark = exp->master->secmark; #endif NF_CT_STAT_INC(net, expect_new); } spin_unlock_bh(&nf_conntrack_expect_lock); } if (!exp && tmpl) __nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC); /* Other CPU might have obtained a pointer to this object before it was * released. Because refcount is 0, refcount_inc_not_zero() will fail. * * After refcount_set(1) it will succeed; ensure that zeroing of * ct->status and the correct ct->net pointer are visible; else other * core might observe CONFIRMED bit which means the entry is valid and * in the hash table, but its not (anymore). */ smp_wmb(); /* Now it is going to be associated with an sk_buff, set refcount to 1. */ refcount_set(&ct->ct_general.use, 1); if (exp) { if (exp->expectfn) exp->expectfn(ct, exp); nf_ct_expect_put(exp); } return &ct->tuplehash[IP_CT_DIR_ORIGINAL]; } /* On success, returns 0, sets skb->_nfct | ctinfo */ static int resolve_normal_ct(struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, u_int8_t protonum, const struct nf_hook_state *state) { const struct nf_conntrack_zone *zone; struct nf_conntrack_tuple tuple; struct nf_conntrack_tuple_hash *h; enum ip_conntrack_info ctinfo; struct nf_conntrack_zone tmp; u32 hash, zone_id, rid; struct nf_conn *ct; if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, state->pf, protonum, state->net, &tuple)) { pr_debug("Can't get tuple\n"); return 0; } /* look for tuple match */ zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); zone_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL); hash = hash_conntrack_raw(&tuple, zone_id, state->net); h = __nf_conntrack_find_get(state->net, zone, &tuple, hash); if (!h) { rid = nf_ct_zone_id(zone, IP_CT_DIR_REPLY); if (zone_id != rid) { u32 tmp = hash_conntrack_raw(&tuple, rid, state->net); h = __nf_conntrack_find_get(state->net, zone, &tuple, tmp); } } if (!h) { h = init_conntrack(state->net, tmpl, &tuple, skb, dataoff, hash); if (!h) return 0; if (IS_ERR(h)) return PTR_ERR(h); } ct = nf_ct_tuplehash_to_ctrack(h); /* It exists; we have (non-exclusive) reference. */ if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) { ctinfo = IP_CT_ESTABLISHED_REPLY; } else { /* Once we've had two way comms, always ESTABLISHED. */ if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { pr_debug("normal packet for %p\n", ct); ctinfo = IP_CT_ESTABLISHED; } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) { pr_debug("related packet for %p\n", ct); ctinfo = IP_CT_RELATED; } else { pr_debug("new packet for %p\n", ct); ctinfo = IP_CT_NEW; } } nf_ct_set(skb, ct, ctinfo); return 0; } /* * icmp packets need special treatment to handle error messages that are * related to a connection. * * Callers need to check if skb has a conntrack assigned when this * helper returns; in such case skb belongs to an already known connection. */ static unsigned int __cold nf_conntrack_handle_icmp(struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, u8 protonum, const struct nf_hook_state *state) { int ret; if (state->pf == NFPROTO_IPV4 && protonum == IPPROTO_ICMP) ret = nf_conntrack_icmpv4_error(tmpl, skb, dataoff, state); #if IS_ENABLED(CONFIG_IPV6) else if (state->pf == NFPROTO_IPV6 && protonum == IPPROTO_ICMPV6) ret = nf_conntrack_icmpv6_error(tmpl, skb, dataoff, state); #endif else return NF_ACCEPT; if (ret <= 0) NF_CT_STAT_INC_ATOMIC(state->net, error); return ret; } static int generic_packet(struct nf_conn *ct, struct sk_buff *skb, enum ip_conntrack_info ctinfo) { const unsigned int *timeout = nf_ct_timeout_lookup(ct); if (!timeout) timeout = &nf_generic_pernet(nf_ct_net(ct))->timeout; nf_ct_refresh_acct(ct, ctinfo, skb, *timeout); return NF_ACCEPT; } /* Returns verdict for packet, or -1 for invalid. */ static int nf_conntrack_handle_packet(struct nf_conn *ct, struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state) { switch (nf_ct_protonum(ct)) { case IPPROTO_TCP: return nf_conntrack_tcp_packet(ct, skb, dataoff, ctinfo, state); case IPPROTO_UDP: return nf_conntrack_udp_packet(ct, skb, dataoff, ctinfo, state); case IPPROTO_ICMP: return nf_conntrack_icmp_packet(ct, skb, ctinfo, state); #if IS_ENABLED(CONFIG_IPV6) case IPPROTO_ICMPV6: return nf_conntrack_icmpv6_packet(ct, skb, ctinfo, state); #endif #ifdef CONFIG_NF_CT_PROTO_UDPLITE case IPPROTO_UDPLITE: return nf_conntrack_udplite_packet(ct, skb, dataoff, ctinfo, state); #endif #ifdef CONFIG_NF_CT_PROTO_SCTP case IPPROTO_SCTP: return nf_conntrack_sctp_packet(ct, skb, dataoff, ctinfo, state); #endif #ifdef CONFIG_NF_CT_PROTO_DCCP case IPPROTO_DCCP: return nf_conntrack_dccp_packet(ct, skb, dataoff, ctinfo, state); #endif #ifdef CONFIG_NF_CT_PROTO_GRE case IPPROTO_GRE: return nf_conntrack_gre_packet(ct, skb, dataoff, ctinfo, state); #endif } return generic_packet(ct, skb, ctinfo); } unsigned int nf_conntrack_in(struct sk_buff *skb, const struct nf_hook_state *state) { enum ip_conntrack_info ctinfo; struct nf_conn *ct, *tmpl; u_int8_t protonum; int dataoff, ret; tmpl = nf_ct_get(skb, &ctinfo); if (tmpl || ctinfo == IP_CT_UNTRACKED) { /* Previously seen (loopback or untracked)? Ignore. */ if ((tmpl && !nf_ct_is_template(tmpl)) || ctinfo == IP_CT_UNTRACKED) return NF_ACCEPT; skb->_nfct = 0; } /* rcu_read_lock()ed by nf_hook_thresh */ dataoff = get_l4proto(skb, skb_network_offset(skb), state->pf, &protonum); if (dataoff <= 0) { pr_debug("not prepared to track yet or error occurred\n"); NF_CT_STAT_INC_ATOMIC(state->net, invalid); ret = NF_ACCEPT; goto out; } if (protonum == IPPROTO_ICMP || protonum == IPPROTO_ICMPV6) { ret = nf_conntrack_handle_icmp(tmpl, skb, dataoff, protonum, state); if (ret <= 0) { ret = -ret; goto out; } /* ICMP[v6] protocol trackers may assign one conntrack. */ if (skb->_nfct) goto out; } repeat: ret = resolve_normal_ct(tmpl, skb, dataoff, protonum, state); if (ret < 0) { /* Too stressed to deal. */ NF_CT_STAT_INC_ATOMIC(state->net, drop); ret = NF_DROP; goto out; } ct = nf_ct_get(skb, &ctinfo); if (!ct) { /* Not valid part of a connection */ NF_CT_STAT_INC_ATOMIC(state->net, invalid); ret = NF_ACCEPT; goto out; } ret = nf_conntrack_handle_packet(ct, skb, dataoff, ctinfo, state); if (ret <= 0) { /* Invalid: inverse of the return code tells * the netfilter core what to do */ pr_debug("nf_conntrack_in: Can't track with proto module\n"); nf_ct_put(ct); skb->_nfct = 0; /* Special case: TCP tracker reports an attempt to reopen a * closed/aborted connection. We have to go back and create a * fresh conntrack. */ if (ret == -NF_REPEAT) goto repeat; NF_CT_STAT_INC_ATOMIC(state->net, invalid); if (ret == -NF_DROP) NF_CT_STAT_INC_ATOMIC(state->net, drop); ret = -ret; goto out; } if (ctinfo == IP_CT_ESTABLISHED_REPLY && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status)) nf_conntrack_event_cache(IPCT_REPLY, ct); out: if (tmpl) nf_ct_put(tmpl); return ret; } EXPORT_SYMBOL_GPL(nf_conntrack_in); /* Alter reply tuple (maybe alter helper). This is for NAT, and is implicitly racy: see __nf_conntrack_confirm */ void nf_conntrack_alter_reply(struct nf_conn *ct, const struct nf_conntrack_tuple *newreply) { struct nf_conn_help *help = nfct_help(ct); /* Should be unconfirmed, so not in hash table yet */ WARN_ON(nf_ct_is_confirmed(ct)); pr_debug("Altering reply tuple of %p to ", ct); nf_ct_dump_tuple(newreply); ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply; if (ct->master || (help && !hlist_empty(&help->expectations))) return; } EXPORT_SYMBOL_GPL(nf_conntrack_alter_reply); /* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */ void __nf_ct_refresh_acct(struct nf_conn *ct, enum ip_conntrack_info ctinfo, const struct sk_buff *skb, u32 extra_jiffies, bool do_acct) { /* Only update if this is not a fixed timeout */ if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) goto acct; /* If not in hash table, timer will not be active yet */ if (nf_ct_is_confirmed(ct)) extra_jiffies += nfct_time_stamp; if (READ_ONCE(ct->timeout) != extra_jiffies) WRITE_ONCE(ct->timeout, extra_jiffies); acct: if (do_acct) nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), skb->len); } EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct); bool nf_ct_kill_acct(struct nf_conn *ct, enum ip_conntrack_info ctinfo, const struct sk_buff *skb) { nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), skb->len); return nf_ct_delete(ct, 0, 0); } EXPORT_SYMBOL_GPL(nf_ct_kill_acct); #if IS_ENABLED(CONFIG_NF_CT_NETLINK) #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_conntrack.h> #include <linux/mutex.h> /* Generic function for tcp/udp/sctp/dccp and alike. */ int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple) { if (nla_put_be16(skb, CTA_PROTO_SRC_PORT, tuple->src.u.tcp.port) || nla_put_be16(skb, CTA_PROTO_DST_PORT, tuple->dst.u.tcp.port)) goto nla_put_failure; return 0; nla_put_failure: return -1; } EXPORT_SYMBOL_GPL(nf_ct_port_tuple_to_nlattr); const struct nla_policy nf_ct_port_nla_policy[CTA_PROTO_MAX+1] = { [CTA_PROTO_SRC_PORT] = { .type = NLA_U16 }, [CTA_PROTO_DST_PORT] = { .type = NLA_U16 }, }; EXPORT_SYMBOL_GPL(nf_ct_port_nla_policy); int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[], struct nf_conntrack_tuple *t, u_int32_t flags) { if (flags & CTA_FILTER_FLAG(CTA_PROTO_SRC_PORT)) { if (!tb[CTA_PROTO_SRC_PORT]) return -EINVAL; t->src.u.tcp.port = nla_get_be16(tb[CTA_PROTO_SRC_PORT]); } if (flags & CTA_FILTER_FLAG(CTA_PROTO_DST_PORT)) { if (!tb[CTA_PROTO_DST_PORT]) return -EINVAL; t->dst.u.tcp.port = nla_get_be16(tb[CTA_PROTO_DST_PORT]); } return 0; } EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_to_tuple); unsigned int nf_ct_port_nlattr_tuple_size(void) { static unsigned int size __read_mostly; if (!size) size = nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1); return size; } EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_tuple_size); #endif /* Used by ipt_REJECT and ip6t_REJECT. */ static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb) { struct nf_conn *ct; enum ip_conntrack_info ctinfo; /* This ICMP is in reverse direction to the packet which caused it */ ct = nf_ct_get(skb, &ctinfo); if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) ctinfo = IP_CT_RELATED_REPLY; else ctinfo = IP_CT_RELATED; /* Attach to new skbuff, and increment count */ nf_ct_set(nskb, ct, ctinfo); nf_conntrack_get(skb_nfct(nskb)); } static int __nf_conntrack_update(struct net *net, struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo) { const struct nf_nat_hook *nat_hook; struct nf_conntrack_tuple_hash *h; struct nf_conntrack_tuple tuple; unsigned int status; int dataoff; u16 l3num; u8 l4num; l3num = nf_ct_l3num(ct); dataoff = get_l4proto(skb, skb_network_offset(skb), l3num, &l4num); if (dataoff <= 0) return -1; if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num, l4num, net, &tuple)) return -1; if (ct->status & IPS_SRC_NAT) { memcpy(tuple.src.u3.all, ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.all, sizeof(tuple.src.u3.all)); tuple.src.u.all = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all; } if (ct->status & IPS_DST_NAT) { memcpy(tuple.dst.u3.all, ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.all, sizeof(tuple.dst.u3.all)); tuple.dst.u.all = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.all; } h = nf_conntrack_find_get(net, nf_ct_zone(ct), &tuple); if (!h) return 0; /* Store status bits of the conntrack that is clashing to re-do NAT * mangling according to what it has been done already to this packet. */ status = ct->status; nf_ct_put(ct); ct = nf_ct_tuplehash_to_ctrack(h); nf_ct_set(skb, ct, ctinfo); nat_hook = rcu_dereference(nf_nat_hook); if (!nat_hook) return 0; if (status & IPS_SRC_NAT && nat_hook->manip_pkt(skb, ct, NF_NAT_MANIP_SRC, IP_CT_DIR_ORIGINAL) == NF_DROP) return -1; if (status & IPS_DST_NAT && nat_hook->manip_pkt(skb, ct, NF_NAT_MANIP_DST, IP_CT_DIR_ORIGINAL) == NF_DROP) return -1; return 0; } /* This packet is coming from userspace via nf_queue, complete the packet * processing after the helper invocation in nf_confirm(). */ static int nf_confirm_cthelper(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo) { const struct nf_conntrack_helper *helper; const struct nf_conn_help *help; int protoff; help = nfct_help(ct); if (!help) return 0; helper = rcu_dereference(help->helper); if (!helper) return 0; if (!(helper->flags & NF_CT_HELPER_F_USERSPACE)) return 0; switch (nf_ct_l3num(ct)) { case NFPROTO_IPV4: protoff = skb_network_offset(skb) + ip_hdrlen(skb); break; #if IS_ENABLED(CONFIG_IPV6) case NFPROTO_IPV6: { __be16 frag_off; u8 pnum; pnum = ipv6_hdr(skb)->nexthdr; protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum, &frag_off); if (protoff < 0 || (frag_off & htons(~0x7)) != 0) return 0; break; } #endif default: return 0; } if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && !nf_is_loopback_packet(skb)) { if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) { NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); return -1; } } /* We've seen it coming out the other side: confirm it */ return nf_conntrack_confirm(skb) == NF_DROP ? - 1 : 0; } static int nf_conntrack_update(struct net *net, struct sk_buff *skb) { enum ip_conntrack_info ctinfo; struct nf_conn *ct; int err; ct = nf_ct_get(skb, &ctinfo); if (!ct) return 0; if (!nf_ct_is_confirmed(ct)) { err = __nf_conntrack_update(net, skb, ct, ctinfo); if (err < 0) return err; ct = nf_ct_get(skb, &ctinfo); } return nf_confirm_cthelper(skb, ct, ctinfo); } static bool nf_conntrack_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple, const struct sk_buff *skb) { const struct nf_conntrack_tuple *src_tuple; const struct nf_conntrack_tuple_hash *hash; struct nf_conntrack_tuple srctuple; enum ip_conntrack_info ctinfo; struct nf_conn *ct; ct = nf_ct_get(skb, &ctinfo); if (ct) { src_tuple = nf_ct_tuple(ct, CTINFO2DIR(ctinfo)); memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple)); return true; } if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), NFPROTO_IPV4, dev_net(skb->dev), &srctuple)) return false; hash = nf_conntrack_find_get(dev_net(skb->dev), &nf_ct_zone_dflt, &srctuple); if (!hash) return false; ct = nf_ct_tuplehash_to_ctrack(hash); src_tuple = nf_ct_tuple(ct, !hash->tuple.dst.dir); memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple)); nf_ct_put(ct); return true; } /* Bring out ya dead! */ static struct nf_conn * get_next_corpse(int (*iter)(struct nf_conn *i, void *data), const struct nf_ct_iter_data *iter_data, unsigned int *bucket) { struct nf_conntrack_tuple_hash *h; struct nf_conn *ct; struct hlist_nulls_node *n; spinlock_t *lockp; for (; *bucket < nf_conntrack_htable_size; (*bucket)++) { struct hlist_nulls_head *hslot = &nf_conntrack_hash[*bucket]; if (hlist_nulls_empty(hslot)) continue; lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS]; local_bh_disable(); nf_conntrack_lock(lockp); hlist_nulls_for_each_entry(h, n, hslot, hnnode) { if (NF_CT_DIRECTION(h) != IP_CT_DIR_REPLY) continue; /* All nf_conn objects are added to hash table twice, one * for original direction tuple, once for the reply tuple. * * Exception: In the IPS_NAT_CLASH case, only the reply * tuple is added (the original tuple already existed for * a different object). * * We only need to call the iterator once for each * conntrack, so we just use the 'reply' direction * tuple while iterating. */ ct = nf_ct_tuplehash_to_ctrack(h); if (iter_data->net && !net_eq(iter_data->net, nf_ct_net(ct))) continue; if (iter(ct, iter_data->data)) goto found; } spin_unlock(lockp); local_bh_enable(); cond_resched(); } return NULL; found: refcount_inc(&ct->ct_general.use); spin_unlock(lockp); local_bh_enable(); return ct; } static void nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data), const struct nf_ct_iter_data *iter_data) { unsigned int bucket = 0; struct nf_conn *ct; might_sleep(); mutex_lock(&nf_conntrack_mutex); while ((ct = get_next_corpse(iter, iter_data, &bucket)) != NULL) { /* Time to push up daises... */ nf_ct_delete(ct, iter_data->portid, iter_data->report); nf_ct_put(ct); cond_resched(); } mutex_unlock(&nf_conntrack_mutex); } void nf_ct_iterate_cleanup_net(int (*iter)(struct nf_conn *i, void *data), const struct nf_ct_iter_data *iter_data) { struct net *net = iter_data->net; struct nf_conntrack_net *cnet = nf_ct_pernet(net); might_sleep(); if (atomic_read(&cnet->count) == 0) return; nf_ct_iterate_cleanup(iter, iter_data); } EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup_net); /** * nf_ct_iterate_destroy - destroy unconfirmed conntracks and iterate table * @iter: callback to invoke for each conntrack * @data: data to pass to @iter * * Like nf_ct_iterate_cleanup, but first marks conntracks on the * unconfirmed list as dying (so they will not be inserted into * main table). * * Can only be called in module exit path. */ void nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), void *data) { struct nf_ct_iter_data iter_data = {}; struct net *net; down_read(&net_rwsem); for_each_net(net) { struct nf_conntrack_net *cnet = nf_ct_pernet(net); if (atomic_read(&cnet->count) == 0) continue; nf_queue_nf_hook_drop(net); } up_read(&net_rwsem); /* Need to wait for netns cleanup worker to finish, if its * running -- it might have deleted a net namespace from * the global list, so hook drop above might not have * affected all namespaces. */ net_ns_barrier(); /* a skb w. unconfirmed conntrack could have been reinjected just * before we called nf_queue_nf_hook_drop(). * * This makes sure its inserted into conntrack table. */ synchronize_net(); nf_ct_ext_bump_genid(); iter_data.data = data; nf_ct_iterate_cleanup(iter, &iter_data); /* Another cpu might be in a rcu read section with * rcu protected pointer cleared in iter callback * or hidden via nf_ct_ext_bump_genid() above. * * Wait until those are done. */ synchronize_rcu(); } EXPORT_SYMBOL_GPL(nf_ct_iterate_destroy); static int kill_all(struct nf_conn *i, void *data) { return 1; } void nf_conntrack_cleanup_start(void) { cleanup_nf_conntrack_bpf(); conntrack_gc_work.exiting = true; } void nf_conntrack_cleanup_end(void) { RCU_INIT_POINTER(nf_ct_hook, NULL); cancel_delayed_work_sync(&conntrack_gc_work.dwork); kvfree(nf_conntrack_hash); nf_conntrack_proto_fini(); nf_conntrack_helper_fini(); nf_conntrack_expect_fini(); kmem_cache_destroy(nf_conntrack_cachep); } /* * Mishearing the voices in his head, our hero wonders how he's * supposed to kill the mall. */ void nf_conntrack_cleanup_net(struct net *net) { LIST_HEAD(single); list_add(&net->exit_list, &single); nf_conntrack_cleanup_net_list(&single); } void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list) { struct nf_ct_iter_data iter_data = {}; struct net *net; int busy; /* * This makes sure all current packets have passed through * netfilter framework. Roll on, two-stage module * delete... */ synchronize_net(); i_see_dead_people: busy = 0; list_for_each_entry(net, net_exit_list, exit_list) { struct nf_conntrack_net *cnet = nf_ct_pernet(net); iter_data.net = net; nf_ct_iterate_cleanup_net(kill_all, &iter_data); if (atomic_read(&cnet->count) != 0) busy = 1; } if (busy) { schedule(); goto i_see_dead_people; } list_for_each_entry(net, net_exit_list, exit_list) { nf_conntrack_ecache_pernet_fini(net); nf_conntrack_expect_pernet_fini(net); free_percpu(net->ct.stat); } } void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls) { struct hlist_nulls_head *hash; unsigned int nr_slots, i; if (*sizep > (UINT_MAX / sizeof(struct hlist_nulls_head))) return NULL; BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head)); nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head)); hash = kvcalloc(nr_slots, sizeof(struct hlist_nulls_head), GFP_KERNEL); if (hash && nulls) for (i = 0; i < nr_slots; i++) INIT_HLIST_NULLS_HEAD(&hash[i], i); return hash; } EXPORT_SYMBOL_GPL(nf_ct_alloc_hashtable); int nf_conntrack_hash_resize(unsigned int hashsize) { int i, bucket; unsigned int old_size; struct hlist_nulls_head *hash, *old_hash; struct nf_conntrack_tuple_hash *h; struct nf_conn *ct; if (!hashsize) return -EINVAL; hash = nf_ct_alloc_hashtable(&hashsize, 1); if (!hash) return -ENOMEM; mutex_lock(&nf_conntrack_mutex); old_size = nf_conntrack_htable_size; if (old_size == hashsize) { mutex_unlock(&nf_conntrack_mutex); kvfree(hash); return 0; } local_bh_disable(); nf_conntrack_all_lock(); write_seqcount_begin(&nf_conntrack_generation); /* Lookups in the old hash might happen in parallel, which means we * might get false negatives during connection lookup. New connections * created because of a false negative won't make it into the hash * though since that required taking the locks. */ for (i = 0; i < nf_conntrack_htable_size; i++) { while (!hlist_nulls_empty(&nf_conntrack_hash[i])) { unsigned int zone_id; h = hlist_nulls_entry(nf_conntrack_hash[i].first, struct nf_conntrack_tuple_hash, hnnode); ct = nf_ct_tuplehash_to_ctrack(h); hlist_nulls_del_rcu(&h->hnnode); zone_id = nf_ct_zone_id(nf_ct_zone(ct), NF_CT_DIRECTION(h)); bucket = __hash_conntrack(nf_ct_net(ct), &h->tuple, zone_id, hashsize); hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]); } } old_hash = nf_conntrack_hash; nf_conntrack_hash = hash; nf_conntrack_htable_size = hashsize; write_seqcount_end(&nf_conntrack_generation); nf_conntrack_all_unlock(); local_bh_enable(); mutex_unlock(&nf_conntrack_mutex); synchronize_net(); kvfree(old_hash); return 0; } int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp) { unsigned int hashsize; int rc; if (current->nsproxy->net_ns != &init_net) return -EOPNOTSUPP; /* On boot, we can set this without any fancy locking. */ if (!nf_conntrack_hash) return param_set_uint(val, kp); rc = kstrtouint(val, 0, &hashsize); if (rc) return rc; return nf_conntrack_hash_resize(hashsize); } int nf_conntrack_init_start(void) { unsigned long nr_pages = totalram_pages(); int max_factor = 8; int ret = -ENOMEM; int i; seqcount_spinlock_init(&nf_conntrack_generation, &nf_conntrack_locks_all_lock); for (i = 0; i < CONNTRACK_LOCKS; i++) spin_lock_init(&nf_conntrack_locks[i]); if (!nf_conntrack_htable_size) { nf_conntrack_htable_size = (((nr_pages << PAGE_SHIFT) / 16384) / sizeof(struct hlist_head)); if (BITS_PER_LONG >= 64 && nr_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE))) nf_conntrack_htable_size = 262144; else if (nr_pages > (1024 * 1024 * 1024 / PAGE_SIZE)) nf_conntrack_htable_size = 65536; if (nf_conntrack_htable_size < 1024) nf_conntrack_htable_size = 1024; /* Use a max. factor of one by default to keep the average * hash chain length at 2 entries. Each entry has to be added * twice (once for original direction, once for reply). * When a table size is given we use the old value of 8 to * avoid implicit reduction of the max entries setting. */ max_factor = 1; } nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, 1); if (!nf_conntrack_hash) return -ENOMEM; nf_conntrack_max = max_factor * nf_conntrack_htable_size; nf_conntrack_cachep = kmem_cache_create("nf_conntrack", sizeof(struct nf_conn), NFCT_INFOMASK + 1, SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN, NULL); if (!nf_conntrack_cachep) goto err_cachep; ret = nf_conntrack_expect_init(); if (ret < 0) goto err_expect; ret = nf_conntrack_helper_init(); if (ret < 0) goto err_helper; ret = nf_conntrack_proto_init(); if (ret < 0) goto err_proto; conntrack_gc_work_init(&conntrack_gc_work); queue_delayed_work(system_power_efficient_wq, &conntrack_gc_work.dwork, HZ); ret = register_nf_conntrack_bpf(); if (ret < 0) goto err_kfunc; return 0; err_kfunc: cancel_delayed_work_sync(&conntrack_gc_work.dwork); nf_conntrack_proto_fini(); err_proto: nf_conntrack_helper_fini(); err_helper: nf_conntrack_expect_fini(); err_expect: kmem_cache_destroy(nf_conntrack_cachep); err_cachep: kvfree(nf_conntrack_hash); return ret; } static void nf_conntrack_set_closing(struct nf_conntrack *nfct) { struct nf_conn *ct = nf_ct_to_nf_conn(nfct); switch (nf_ct_protonum(ct)) { case IPPROTO_TCP: nf_conntrack_tcp_set_closing(ct); break; } } static const struct nf_ct_hook nf_conntrack_hook = { .update = nf_conntrack_update, .destroy = nf_ct_destroy, .get_tuple_skb = nf_conntrack_get_tuple_skb, .attach = nf_conntrack_attach, .set_closing = nf_conntrack_set_closing, .confirm = __nf_conntrack_confirm, }; void nf_conntrack_init_end(void) { RCU_INIT_POINTER(nf_ct_hook, &nf_conntrack_hook); } /* * We need to use special "null" values, not used in hash table */ #define UNCONFIRMED_NULLS_VAL ((1<<30)+0) int nf_conntrack_init_net(struct net *net) { struct nf_conntrack_net *cnet = nf_ct_pernet(net); int ret = -ENOMEM; BUILD_BUG_ON(IP_CT_UNTRACKED == IP_CT_NUMBER); BUILD_BUG_ON_NOT_POWER_OF_2(CONNTRACK_LOCKS); atomic_set(&cnet->count, 0); net->ct.stat = alloc_percpu(struct ip_conntrack_stat); if (!net->ct.stat) return ret; ret = nf_conntrack_expect_pernet_init(net); if (ret < 0) goto err_expect; nf_conntrack_acct_pernet_init(net); nf_conntrack_tstamp_pernet_init(net); nf_conntrack_ecache_pernet_init(net); nf_conntrack_proto_pernet_init(net); return 0; err_expect: free_percpu(net->ct.stat); return ret; } /* ctnetlink code shared by both ctnetlink and nf_conntrack_bpf */ int __nf_ct_change_timeout(struct nf_conn *ct, u64 timeout) { if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) return -EPERM; __nf_ct_set_timeout(ct, timeout); if (test_bit(IPS_DYING_BIT, &ct->status)) return -ETIME; return 0; } EXPORT_SYMBOL_GPL(__nf_ct_change_timeout); void __nf_ct_change_status(struct nf_conn *ct, unsigned long on, unsigned long off) { unsigned int bit; /* Ignore these unchangable bits */ on &= ~IPS_UNCHANGEABLE_MASK; off &= ~IPS_UNCHANGEABLE_MASK; for (bit = 0; bit < __IPS_MAX_BIT; bit++) { if (on & (1 << bit)) set_bit(bit, &ct->status); else if (off & (1 << bit)) clear_bit(bit, &ct->status); } } EXPORT_SYMBOL_GPL(__nf_ct_change_status); int nf_ct_change_status_common(struct nf_conn *ct, unsigned int status) { unsigned long d; d = ct->status ^ status; if (d & (IPS_EXPECTED|IPS_CONFIRMED|IPS_DYING)) /* unchangeable */ return -EBUSY; if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY)) /* SEEN_REPLY bit can only be set */ return -EBUSY; if (d & IPS_ASSURED && !(status & IPS_ASSURED)) /* ASSURED bit can only be set */ return -EBUSY; __nf_ct_change_status(ct, status, 0); return 0; } EXPORT_SYMBOL_GPL(nf_ct_change_status_common); |
65 193 193 49 559 654 654 49 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * 9P Client Definitions * * Copyright (C) 2008 by Eric Van Hensbergen <ericvh@gmail.com> * Copyright (C) 2007 by Latchesar Ionkov <lucho@ionkov.net> */ #ifndef NET_9P_CLIENT_H #define NET_9P_CLIENT_H #include <linux/utsname.h> #include <linux/idr.h> #include <linux/tracepoint-defs.h> /* Number of requests per row */ #define P9_ROW_MAXTAG 255 /** enum p9_proto_versions - 9P protocol versions * @p9_proto_legacy: 9P Legacy mode, pre-9P2000.u * @p9_proto_2000u: 9P2000.u extension * @p9_proto_2000L: 9P2000.L extension */ enum p9_proto_versions { p9_proto_legacy, p9_proto_2000u, p9_proto_2000L, }; /** * enum p9_trans_status - different states of underlying transports * @Connected: transport is connected and healthy * @Disconnected: transport has been disconnected * @Hung: transport is connected by wedged * * This enumeration details the various states a transport * instatiation can be in. */ enum p9_trans_status { Connected, BeginDisconnect, Disconnected, Hung, }; /** * enum p9_req_status_t - status of a request * @REQ_STATUS_ALLOC: request has been allocated but not sent * @REQ_STATUS_UNSENT: request waiting to be sent * @REQ_STATUS_SENT: request sent to server * @REQ_STATUS_RCVD: response received from server * @REQ_STATUS_FLSHD: request has been flushed * @REQ_STATUS_ERROR: request encountered an error on the client side */ enum p9_req_status_t { REQ_STATUS_ALLOC, REQ_STATUS_UNSENT, REQ_STATUS_SENT, REQ_STATUS_RCVD, REQ_STATUS_FLSHD, REQ_STATUS_ERROR, }; /** * struct p9_req_t - request slots * @status: status of this request slot * @t_err: transport error * @wq: wait_queue for the client to block on for this request * @tc: the request fcall structure * @rc: the response fcall structure * @req_list: link for higher level objects to chain requests */ struct p9_req_t { int status; int t_err; refcount_t refcount; wait_queue_head_t wq; struct p9_fcall tc; struct p9_fcall rc; struct list_head req_list; }; /** * struct p9_client - per client instance state * @lock: protect @fids and @reqs * @msize: maximum data size negotiated by protocol * @proto_version: 9P protocol version to use * @trans_mod: module API instantiated with this client * @status: connection state * @trans: tranport instance state and API * @fids: All active FID handles * @reqs: All active requests. * @name: node name used as client id * * The client structure is used to keep track of various per-client * state that has been instantiated. */ struct p9_client { spinlock_t lock; unsigned int msize; unsigned char proto_version; struct p9_trans_module *trans_mod; enum p9_trans_status status; void *trans; struct kmem_cache *fcall_cache; union { struct { int rfd; int wfd; } fd; struct { u16 port; bool privport; } tcp; } trans_opts; struct idr fids; struct idr reqs; char name[__NEW_UTS_LEN + 1]; }; /** * struct p9_fid - file system entity handle * @clnt: back pointer to instantiating &p9_client * @fid: numeric identifier for this handle * @mode: current mode of this fid (enum?) * @qid: the &p9_qid server identifier this handle points to * @iounit: the server reported maximum transaction size for this file * @uid: the numeric uid of the local user who owns this handle * @rdir: readdir accounting structure (allocated on demand) * @dlist: per-dentry fid tracking * * TODO: This needs lots of explanation. */ enum fid_source { FID_FROM_OTHER, FID_FROM_INODE, FID_FROM_DENTRY, }; struct p9_fid { struct p9_client *clnt; u32 fid; refcount_t count; int mode; struct p9_qid qid; u32 iounit; kuid_t uid; void *rdir; struct hlist_node dlist; /* list of all fids attached to a dentry */ struct hlist_node ilist; }; /** * struct p9_dirent - directory entry structure * @qid: The p9 server qid for this dirent * @d_off: offset to the next dirent * @d_type: type of file * @d_name: file name */ struct p9_dirent { struct p9_qid qid; u64 d_off; unsigned char d_type; char d_name[256]; }; struct iov_iter; int p9_show_client_options(struct seq_file *m, struct p9_client *clnt); int p9_client_statfs(struct p9_fid *fid, struct p9_rstatfs *sb); int p9_client_rename(struct p9_fid *fid, struct p9_fid *newdirfid, const char *name); int p9_client_renameat(struct p9_fid *olddirfid, const char *old_name, struct p9_fid *newdirfid, const char *new_name); struct p9_client *p9_client_create(const char *dev_name, char *options); void p9_client_destroy(struct p9_client *clnt); void p9_client_disconnect(struct p9_client *clnt); void p9_client_begin_disconnect(struct p9_client *clnt); struct p9_fid *p9_client_attach(struct p9_client *clnt, struct p9_fid *afid, const char *uname, kuid_t n_uname, const char *aname); struct p9_fid *p9_client_walk(struct p9_fid *oldfid, uint16_t nwname, const unsigned char * const *wnames, int clone); int p9_client_open(struct p9_fid *fid, int mode); int p9_client_fcreate(struct p9_fid *fid, const char *name, u32 perm, int mode, char *extension); int p9_client_link(struct p9_fid *fid, struct p9_fid *oldfid, const char *newname); int p9_client_symlink(struct p9_fid *fid, const char *name, const char *symname, kgid_t gid, struct p9_qid *qid); int p9_client_create_dotl(struct p9_fid *ofid, const char *name, u32 flags, u32 mode, kgid_t gid, struct p9_qid *qid); int p9_client_clunk(struct p9_fid *fid); int p9_client_fsync(struct p9_fid *fid, int datasync); int p9_client_remove(struct p9_fid *fid); int p9_client_unlinkat(struct p9_fid *dfid, const char *name, int flags); int p9_client_read(struct p9_fid *fid, u64 offset, struct iov_iter *to, int *err); int p9_client_read_once(struct p9_fid *fid, u64 offset, struct iov_iter *to, int *err); int p9_client_write(struct p9_fid *fid, u64 offset, struct iov_iter *from, int *err); int p9_client_readdir(struct p9_fid *fid, char *data, u32 count, u64 offset); int p9dirent_read(struct p9_client *clnt, char *buf, int len, struct p9_dirent *dirent); struct p9_wstat *p9_client_stat(struct p9_fid *fid); int p9_client_wstat(struct p9_fid *fid, struct p9_wstat *wst); int p9_client_setattr(struct p9_fid *fid, struct p9_iattr_dotl *attr); struct p9_stat_dotl *p9_client_getattr_dotl(struct p9_fid *fid, u64 request_mask); int p9_client_mknod_dotl(struct p9_fid *oldfid, const char *name, int mode, dev_t rdev, kgid_t gid, struct p9_qid *qid); int p9_client_mkdir_dotl(struct p9_fid *fid, const char *name, int mode, kgid_t gid, struct p9_qid *qid); int p9_client_lock_dotl(struct p9_fid *fid, struct p9_flock *flock, u8 *status); int p9_client_getlock_dotl(struct p9_fid *fid, struct p9_getlock *fl); void p9_fcall_fini(struct p9_fcall *fc); struct p9_req_t *p9_tag_lookup(struct p9_client *c, u16 tag); static inline void p9_req_get(struct p9_req_t *r) { refcount_inc(&r->refcount); } static inline int p9_req_try_get(struct p9_req_t *r) { return refcount_inc_not_zero(&r->refcount); } int p9_req_put(struct p9_client *c, struct p9_req_t *r); /* We cannot have the real tracepoints in header files, * use a wrapper function */ DECLARE_TRACEPOINT(9p_fid_ref); void do_trace_9p_fid_get(struct p9_fid *fid); void do_trace_9p_fid_put(struct p9_fid *fid); /* fid reference counting helpers: * - fids used for any length of time should always be referenced through * p9_fid_get(), and released with p9_fid_put() * - v9fs_fid_lookup() or similar will automatically call get for you * and also require a put * - the *_fid_add() helpers will stash the fid in the inode, * at which point it is the responsibility of evict_inode() * to call the put * - the last put will automatically send a clunk to the server */ static inline struct p9_fid *p9_fid_get(struct p9_fid *fid) { if (tracepoint_enabled(9p_fid_ref)) do_trace_9p_fid_get(fid); refcount_inc(&fid->count); return fid; } static inline int p9_fid_put(struct p9_fid *fid) { if (!fid || IS_ERR(fid)) return 0; if (tracepoint_enabled(9p_fid_ref)) do_trace_9p_fid_put(fid); if (!refcount_dec_and_test(&fid->count)) return 0; return p9_client_clunk(fid); } void p9_client_cb(struct p9_client *c, struct p9_req_t *req, int status); int p9_parse_header(struct p9_fcall *pdu, int32_t *size, int8_t *type, int16_t *tag, int rewind); int p9stat_read(struct p9_client *clnt, char *buf, int len, struct p9_wstat *st); void p9stat_free(struct p9_wstat *stbuf); int p9_is_proto_dotu(struct p9_client *clnt); int p9_is_proto_dotl(struct p9_client *clnt); struct p9_fid *p9_client_xattrwalk(struct p9_fid *file_fid, const char *attr_name, u64 *attr_size); int p9_client_xattrcreate(struct p9_fid *fid, const char *name, u64 attr_size, int flags); int p9_client_readlink(struct p9_fid *fid, char **target); int p9_client_init(void); void p9_client_exit(void); #endif /* NET_9P_CLIENT_H */ |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SCHED_SMT_H #define _LINUX_SCHED_SMT_H #include <linux/static_key.h> #ifdef CONFIG_SCHED_SMT extern struct static_key_false sched_smt_present; static __always_inline bool sched_smt_active(void) { return static_branch_likely(&sched_smt_present); } #else static inline bool sched_smt_active(void) { return false; } #endif void arch_smt_update(void); #endif |
13 6778 6781 29876 197 3621 6796 6796 29730 311 29730 9881 23654 3268 24480 422 24480 6772 6772 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Variant of atomic_t specialized for reference counts. * * The interface matches the atomic_t interface (to aid in porting) but only * provides the few functions one should use for reference counting. * * Saturation semantics * ==================== * * refcount_t differs from atomic_t in that the counter saturates at * REFCOUNT_SATURATED and will not move once there. This avoids wrapping the * counter and causing 'spurious' use-after-free issues. In order to avoid the * cost associated with introducing cmpxchg() loops into all of the saturating * operations, we temporarily allow the counter to take on an unchecked value * and then explicitly set it to REFCOUNT_SATURATED on detecting that underflow * or overflow has occurred. Although this is racy when multiple threads * access the refcount concurrently, by placing REFCOUNT_SATURATED roughly * equidistant from 0 and INT_MAX we minimise the scope for error: * * INT_MAX REFCOUNT_SATURATED UINT_MAX * 0 (0x7fff_ffff) (0xc000_0000) (0xffff_ffff) * +--------------------------------+----------------+----------------+ * <---------- bad value! ----------> * * (in a signed view of the world, the "bad value" range corresponds to * a negative counter value). * * As an example, consider a refcount_inc() operation that causes the counter * to overflow: * * int old = atomic_fetch_add_relaxed(r); * // old is INT_MAX, refcount now INT_MIN (0x8000_0000) * if (old < 0) * atomic_set(r, REFCOUNT_SATURATED); * * If another thread also performs a refcount_inc() operation between the two * atomic operations, then the count will continue to edge closer to 0. If it * reaches a value of 1 before /any/ of the threads reset it to the saturated * value, then a concurrent refcount_dec_and_test() may erroneously free the * underlying object. * Linux limits the maximum number of tasks to PID_MAX_LIMIT, which is currently * 0x400000 (and can't easily be raised in the future beyond FUTEX_TID_MASK). * With the current PID limit, if no batched refcounting operations are used and * the attacker can't repeatedly trigger kernel oopses in the middle of refcount * operations, this makes it impossible for a saturated refcount to leave the * saturation range, even if it is possible for multiple uses of the same * refcount to nest in the context of a single task: * * (UINT_MAX+1-REFCOUNT_SATURATED) / PID_MAX_LIMIT = * 0x40000000 / 0x400000 = 0x100 = 256 * * If hundreds of references are added/removed with a single refcounting * operation, it may potentially be possible to leave the saturation range; but * given the precise timing details involved with the round-robin scheduling of * each thread manipulating the refcount and the need to hit the race multiple * times in succession, there doesn't appear to be a practical avenue of attack * even if using refcount_add() operations with larger increments. * * Memory ordering * =============== * * Memory ordering rules are slightly relaxed wrt regular atomic_t functions * and provide only what is strictly required for refcounts. * * The increments are fully relaxed; these will not provide ordering. The * rationale is that whatever is used to obtain the object we're increasing the * reference count on will provide the ordering. For locked data structures, * its the lock acquire, for RCU/lockless data structures its the dependent * load. * * Do note that inc_not_zero() provides a control dependency which will order * future stores against the inc, this ensures we'll never modify the object * if we did not in fact acquire a reference. * * The decrements will provide release order, such that all the prior loads and * stores will be issued before, it also provides a control dependency, which * will order us against the subsequent free(). * * The control dependency is against the load of the cmpxchg (ll/sc) that * succeeded. This means the stores aren't fully ordered, but this is fine * because the 1->0 transition indicates no concurrency. * * Note that the allocator is responsible for ordering things between free() * and alloc(). * * The decrements dec_and_test() and sub_and_test() also provide acquire * ordering on success. * */ #ifndef _LINUX_REFCOUNT_H #define _LINUX_REFCOUNT_H #include <linux/atomic.h> #include <linux/bug.h> #include <linux/compiler.h> #include <linux/limits.h> #include <linux/spinlock_types.h> struct mutex; /** * typedef refcount_t - variant of atomic_t specialized for reference counts * @refs: atomic_t counter field * * The counter saturates at REFCOUNT_SATURATED and will not move once * there. This avoids wrapping the counter and causing 'spurious' * use-after-free bugs. */ typedef struct refcount_struct { atomic_t refs; } refcount_t; #define REFCOUNT_INIT(n) { .refs = ATOMIC_INIT(n), } #define REFCOUNT_MAX INT_MAX #define REFCOUNT_SATURATED (INT_MIN / 2) enum refcount_saturation_type { REFCOUNT_ADD_NOT_ZERO_OVF, REFCOUNT_ADD_OVF, REFCOUNT_ADD_UAF, REFCOUNT_SUB_UAF, REFCOUNT_DEC_LEAK, }; void refcount_warn_saturate(refcount_t *r, enum refcount_saturation_type t); /** * refcount_set - set a refcount's value * @r: the refcount * @n: value to which the refcount will be set */ static inline void refcount_set(refcount_t *r, int n) { atomic_set(&r->refs, n); } /** * refcount_read - get a refcount's value * @r: the refcount * * Return: the refcount's value */ static inline unsigned int refcount_read(const refcount_t *r) { return atomic_read(&r->refs); } static inline __must_check bool __refcount_add_not_zero(int i, refcount_t *r, int *oldp) { int old = refcount_read(r); do { if (!old) break; } while (!atomic_try_cmpxchg_relaxed(&r->refs, &old, old + i)); if (oldp) *oldp = old; if (unlikely(old < 0 || old + i < 0)) refcount_warn_saturate(r, REFCOUNT_ADD_NOT_ZERO_OVF); return old; } /** * refcount_add_not_zero - add a value to a refcount unless it is 0 * @i: the value to add to the refcount * @r: the refcount * * Will saturate at REFCOUNT_SATURATED and WARN. * * Provides no memory ordering, it is assumed the caller has guaranteed the * object memory to be stable (RCU, etc.). It does provide a control dependency * and thereby orders future stores. See the comment on top. * * Use of this function is not recommended for the normal reference counting * use case in which references are taken and released one at a time. In these * cases, refcount_inc(), or one of its variants, should instead be used to * increment a reference count. * * Return: false if the passed refcount is 0, true otherwise */ static inline __must_check bool refcount_add_not_zero(int i, refcount_t *r) { return __refcount_add_not_zero(i, r, NULL); } static inline void __refcount_add(int i, refcount_t *r, int *oldp) { int old = atomic_fetch_add_relaxed(i, &r->refs); if (oldp) *oldp = old; if (unlikely(!old)) refcount_warn_saturate(r, REFCOUNT_ADD_UAF); else if (unlikely(old < 0 || old + i < 0)) refcount_warn_saturate(r, REFCOUNT_ADD_OVF); } /** * refcount_add - add a value to a refcount * @i: the value to add to the refcount * @r: the refcount * * Similar to atomic_add(), but will saturate at REFCOUNT_SATURATED and WARN. * * Provides no memory ordering, it is assumed the caller has guaranteed the * object memory to be stable (RCU, etc.). It does provide a control dependency * and thereby orders future stores. See the comment on top. * * Use of this function is not recommended for the normal reference counting * use case in which references are taken and released one at a time. In these * cases, refcount_inc(), or one of its variants, should instead be used to * increment a reference count. */ static inline void refcount_add(int i, refcount_t *r) { __refcount_add(i, r, NULL); } static inline __must_check bool __refcount_inc_not_zero(refcount_t *r, int *oldp) { return __refcount_add_not_zero(1, r, oldp); } /** * refcount_inc_not_zero - increment a refcount unless it is 0 * @r: the refcount to increment * * Similar to atomic_inc_not_zero(), but will saturate at REFCOUNT_SATURATED * and WARN. * * Provides no memory ordering, it is assumed the caller has guaranteed the * object memory to be stable (RCU, etc.). It does provide a control dependency * and thereby orders future stores. See the comment on top. * * Return: true if the increment was successful, false otherwise */ static inline __must_check bool refcount_inc_not_zero(refcount_t *r) { return __refcount_inc_not_zero(r, NULL); } static inline void __refcount_inc(refcount_t *r, int *oldp) { __refcount_add(1, r, oldp); } /** * refcount_inc - increment a refcount * @r: the refcount to increment * * Similar to atomic_inc(), but will saturate at REFCOUNT_SATURATED and WARN. * * Provides no memory ordering, it is assumed the caller already has a * reference on the object. * * Will WARN if the refcount is 0, as this represents a possible use-after-free * condition. */ static inline void refcount_inc(refcount_t *r) { __refcount_inc(r, NULL); } static inline __must_check bool __refcount_sub_and_test(int i, refcount_t *r, int *oldp) { int old = atomic_fetch_sub_release(i, &r->refs); if (oldp) *oldp = old; if (old == i) { smp_acquire__after_ctrl_dep(); return true; } if (unlikely(old < 0 || old - i < 0)) refcount_warn_saturate(r, REFCOUNT_SUB_UAF); return false; } /** * refcount_sub_and_test - subtract from a refcount and test if it is 0 * @i: amount to subtract from the refcount * @r: the refcount * * Similar to atomic_dec_and_test(), but it will WARN, return false and * ultimately leak on underflow and will fail to decrement when saturated * at REFCOUNT_SATURATED. * * Provides release memory ordering, such that prior loads and stores are done * before, and provides an acquire ordering on success such that free() * must come after. * * Use of this function is not recommended for the normal reference counting * use case in which references are taken and released one at a time. In these * cases, refcount_dec(), or one of its variants, should instead be used to * decrement a reference count. * * Return: true if the resulting refcount is 0, false otherwise */ static inline __must_check bool refcount_sub_and_test(int i, refcount_t *r) { return __refcount_sub_and_test(i, r, NULL); } static inline __must_check bool __refcount_dec_and_test(refcount_t *r, int *oldp) { return __refcount_sub_and_test(1, r, oldp); } /** * refcount_dec_and_test - decrement a refcount and test if it is 0 * @r: the refcount * * Similar to atomic_dec_and_test(), it will WARN on underflow and fail to * decrement when saturated at REFCOUNT_SATURATED. * * Provides release memory ordering, such that prior loads and stores are done * before, and provides an acquire ordering on success such that free() * must come after. * * Return: true if the resulting refcount is 0, false otherwise */ static inline __must_check bool refcount_dec_and_test(refcount_t *r) { return __refcount_dec_and_test(r, NULL); } static inline void __refcount_dec(refcount_t *r, int *oldp) { int old = atomic_fetch_sub_release(1, &r->refs); if (oldp) *oldp = old; if (unlikely(old <= 1)) refcount_warn_saturate(r, REFCOUNT_DEC_LEAK); } /** * refcount_dec - decrement a refcount * @r: the refcount * * Similar to atomic_dec(), it will WARN on underflow and fail to decrement * when saturated at REFCOUNT_SATURATED. * * Provides release memory ordering, such that prior loads and stores are done * before. */ static inline void refcount_dec(refcount_t *r) { __refcount_dec(r, NULL); } extern __must_check bool refcount_dec_if_one(refcount_t *r); extern __must_check bool refcount_dec_not_one(refcount_t *r); extern __must_check bool refcount_dec_and_mutex_lock(refcount_t *r, struct mutex *lock) __cond_acquires(lock); extern __must_check bool refcount_dec_and_lock(refcount_t *r, spinlock_t *lock) __cond_acquires(lock); extern __must_check bool refcount_dec_and_lock_irqsave(refcount_t *r, spinlock_t *lock, unsigned long *flags) __cond_acquires(lock); #endif /* _LINUX_REFCOUNT_H */ |
2890 2871 2875 3 358 1 22 153 97 163 10 61 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 | /* SPDX-License-Identifier: GPL-2.0 */ /* * fscrypt.h: declarations for per-file encryption * * Filesystems that implement per-file encryption must include this header * file. * * Copyright (C) 2015, Google, Inc. * * Written by Michael Halcrow, 2015. * Modified by Jaegeuk Kim, 2015. */ #ifndef _LINUX_FSCRYPT_H #define _LINUX_FSCRYPT_H #include <linux/fs.h> #include <linux/mm.h> #include <linux/slab.h> #include <uapi/linux/fscrypt.h> #include <linux/android_kabi.h> /* * The lengths of all file contents blocks must be divisible by this value. * This is needed to ensure that all contents encryption modes will work, as * some of the supported modes don't support arbitrarily byte-aligned messages. * * Since the needed alignment is 16 bytes, most filesystems will meet this * requirement naturally, as typical block sizes are powers of 2. However, if a * filesystem can generate arbitrarily byte-aligned block lengths (e.g., via * compression), then it will need to pad to this alignment before encryption. */ #define FSCRYPT_CONTENTS_ALIGNMENT 16 union fscrypt_policy; struct fscrypt_info; struct fs_parameter; struct seq_file; struct fscrypt_str { unsigned char *name; u32 len; }; struct fscrypt_name { const struct qstr *usr_fname; struct fscrypt_str disk_name; u32 hash; u32 minor_hash; struct fscrypt_str crypto_buf; bool is_nokey_name; }; #define FSTR_INIT(n, l) { .name = n, .len = l } #define FSTR_TO_QSTR(f) QSTR_INIT((f)->name, (f)->len) #define fname_name(p) ((p)->disk_name.name) #define fname_len(p) ((p)->disk_name.len) /* Maximum value for the third parameter of fscrypt_operations.set_context(). */ #define FSCRYPT_SET_CONTEXT_MAX_SIZE 40 #ifdef CONFIG_FS_ENCRYPTION /* * If set, the fscrypt bounce page pool won't be allocated (unless another * filesystem needs it). Set this if the filesystem always uses its own bounce * pages for writes and therefore won't need the fscrypt bounce page pool. */ #define FS_CFLG_OWN_PAGES (1U << 1) /* * If set, then fs/crypto/ will allow users to select a crypto data unit size * that is less than the filesystem block size. This is done via the * log2_data_unit_size field of the fscrypt policy. This flag is not compatible * with filesystems that encrypt variable-length blocks (i.e. blocks that aren't * all equal to filesystem's block size), for example as a result of * compression. It's also not compatible with the * fscrypt_encrypt_block_inplace() and fscrypt_decrypt_block_inplace() * functions. */ #define FS_CFLG_SUPPORTS_SUBBLOCK_DATA_UNITS (1U << 2) /* Crypto operations for filesystems */ struct fscrypt_operations { /* Set of optional flags; see above for allowed flags */ unsigned int flags; /* * If set, this is a filesystem-specific key description prefix that * will be accepted for "logon" keys for v1 fscrypt policies, in * addition to the generic prefix "fscrypt:". This functionality is * deprecated, so new filesystems shouldn't set this field. */ const char *key_prefix; /* * Get the fscrypt context of the given inode. * * @inode: the inode whose context to get * @ctx: the buffer into which to get the context * @len: length of the @ctx buffer in bytes * * Return: On success, returns the length of the context in bytes; this * may be less than @len. On failure, returns -ENODATA if the * inode doesn't have a context, -ERANGE if the context is * longer than @len, or another -errno code. */ int (*get_context)(struct inode *inode, void *ctx, size_t len); /* * Set an fscrypt context on the given inode. * * @inode: the inode whose context to set. The inode won't already have * an fscrypt context. * @ctx: the context to set * @len: length of @ctx in bytes (at most FSCRYPT_SET_CONTEXT_MAX_SIZE) * @fs_data: If called from fscrypt_set_context(), this will be the * value the filesystem passed to fscrypt_set_context(). * Otherwise (i.e. when called from * FS_IOC_SET_ENCRYPTION_POLICY) this will be NULL. * * i_rwsem will be held for write. * * Return: 0 on success, -errno on failure. */ int (*set_context)(struct inode *inode, const void *ctx, size_t len, void *fs_data); /* * Get the dummy fscrypt policy in use on the filesystem (if any). * * Filesystems only need to implement this function if they support the * test_dummy_encryption mount option. * * Return: A pointer to the dummy fscrypt policy, if the filesystem is * mounted with test_dummy_encryption; otherwise NULL. */ const union fscrypt_policy *(*get_dummy_policy)(struct super_block *sb); /* * Check whether a directory is empty. i_rwsem will be held for write. */ bool (*empty_dir)(struct inode *inode); /* * Check whether the filesystem's inode numbers and UUID are stable, * meaning that they will never be changed even by offline operations * such as filesystem shrinking and therefore can be used in the * encryption without the possibility of files becoming unreadable. * * Filesystems only need to implement this function if they want to * support the FSCRYPT_POLICY_FLAG_IV_INO_LBLK_{32,64} flags. These * flags are designed to work around the limitations of UFS and eMMC * inline crypto hardware, and they shouldn't be used in scenarios where * such hardware isn't being used. * * Leaving this NULL is equivalent to always returning false. */ bool (*has_stable_inodes)(struct super_block *sb); /* * Get the number of bits that the filesystem uses to represent inode * numbers and file logical block numbers. * * By default, both of these are assumed to be 64-bit. This function * can be implemented to declare that either or both of these numbers is * shorter, which may allow the use of the * FSCRYPT_POLICY_FLAG_IV_INO_LBLK_{32,64} flags and/or the use of * inline crypto hardware whose maximum DUN length is less than 64 bits * (e.g., eMMC v5.2 spec compliant hardware). This function only needs * to be implemented if support for one of these features is needed. */ void (*get_ino_and_lblk_bits)(struct super_block *sb, int *ino_bits_ret, int *lblk_bits_ret); /* * Return an array of pointers to the block devices to which the * filesystem may write encrypted file contents, NULL if the filesystem * only has a single such block device, or an ERR_PTR() on error. * * On successful non-NULL return, *num_devs is set to the number of * devices in the returned array. The caller must free the returned * array using kfree(). * * If the filesystem can use multiple block devices (other than block * devices that aren't used for encrypted file contents, such as * external journal devices), and wants to support inline encryption, * then it must implement this function. Otherwise it's not needed. */ struct block_device **(*get_devices)(struct super_block *sb, unsigned int *num_devs); ANDROID_KABI_RESERVE(1); ANDROID_KABI_RESERVE(2); ANDROID_KABI_RESERVE(3); ANDROID_KABI_RESERVE(4); ANDROID_OEM_DATA_ARRAY(1, 4); }; static inline struct fscrypt_info *fscrypt_get_info(const struct inode *inode) { /* * Pairs with the cmpxchg_release() in fscrypt_setup_encryption_info(). * I.e., another task may publish ->i_crypt_info concurrently, executing * a RELEASE barrier. We need to use smp_load_acquire() here to safely * ACQUIRE the memory the other task published. */ return smp_load_acquire(&inode->i_crypt_info); } /** * fscrypt_needs_contents_encryption() - check whether an inode needs * contents encryption * @inode: the inode to check * * Return: %true iff the inode is an encrypted regular file and the kernel was * built with fscrypt support. * * If you need to know whether the encrypt bit is set even when the kernel was * built without fscrypt support, you must use IS_ENCRYPTED() directly instead. */ static inline bool fscrypt_needs_contents_encryption(const struct inode *inode) { return IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode); } /* * When d_splice_alias() moves a directory's no-key alias to its plaintext alias * as a result of the encryption key being added, DCACHE_NOKEY_NAME must be * cleared. Note that we don't have to support arbitrary moves of this flag * because fscrypt doesn't allow no-key names to be the source or target of a * rename(). */ static inline void fscrypt_handle_d_move(struct dentry *dentry) { dentry->d_flags &= ~DCACHE_NOKEY_NAME; } /** * fscrypt_is_nokey_name() - test whether a dentry is a no-key name * @dentry: the dentry to check * * This returns true if the dentry is a no-key dentry. A no-key dentry is a * dentry that was created in an encrypted directory that hasn't had its * encryption key added yet. Such dentries may be either positive or negative. * * When a filesystem is asked to create a new filename in an encrypted directory * and the new filename's dentry is a no-key dentry, it must fail the operation * with ENOKEY. This includes ->create(), ->mkdir(), ->mknod(), ->symlink(), * ->rename(), and ->link(). (However, ->rename() and ->link() are already * handled by fscrypt_prepare_rename() and fscrypt_prepare_link().) * * This is necessary because creating a filename requires the directory's * encryption key, but just checking for the key on the directory inode during * the final filesystem operation doesn't guarantee that the key was available * during the preceding dentry lookup. And the key must have already been * available during the dentry lookup in order for it to have been checked * whether the filename already exists in the directory and for the new file's * dentry not to be invalidated due to it incorrectly having the no-key flag. * * Return: %true if the dentry is a no-key name */ static inline bool fscrypt_is_nokey_name(const struct dentry *dentry) { return dentry->d_flags & DCACHE_NOKEY_NAME; } /* crypto.c */ void fscrypt_enqueue_decrypt_work(struct work_struct *); struct page *fscrypt_encrypt_pagecache_blocks(struct page *page, unsigned int len, unsigned int offs, gfp_t gfp_flags); int fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page, unsigned int len, unsigned int offs, u64 lblk_num, gfp_t gfp_flags); int fscrypt_decrypt_pagecache_blocks(struct folio *folio, size_t len, size_t offs); int fscrypt_decrypt_block_inplace(const struct inode *inode, struct page *page, unsigned int len, unsigned int offs, u64 lblk_num); static inline bool fscrypt_is_bounce_page(struct page *page) { return page->mapping == NULL; } static inline struct page *fscrypt_pagecache_page(struct page *bounce_page) { return (struct page *)page_private(bounce_page); } void fscrypt_free_bounce_page(struct page *bounce_page); /* policy.c */ int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg); int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg); int fscrypt_ioctl_get_policy_ex(struct file *filp, void __user *arg); int fscrypt_ioctl_get_nonce(struct file *filp, void __user *arg); int fscrypt_has_permitted_context(struct inode *parent, struct inode *child); int fscrypt_context_for_new_inode(void *ctx, struct inode *inode); int fscrypt_set_context(struct inode *inode, void *fs_data); struct fscrypt_dummy_policy { const union fscrypt_policy *policy; }; int fscrypt_parse_test_dummy_encryption(const struct fs_parameter *param, struct fscrypt_dummy_policy *dummy_policy); bool fscrypt_dummy_policies_equal(const struct fscrypt_dummy_policy *p1, const struct fscrypt_dummy_policy *p2); void fscrypt_show_test_dummy_encryption(struct seq_file *seq, char sep, struct super_block *sb); static inline bool fscrypt_is_dummy_policy_set(const struct fscrypt_dummy_policy *dummy_policy) { return dummy_policy->policy != NULL; } static inline void fscrypt_free_dummy_policy(struct fscrypt_dummy_policy *dummy_policy) { kfree(dummy_policy->policy); dummy_policy->policy = NULL; } /* keyring.c */ void fscrypt_destroy_keyring(struct super_block *sb); int fscrypt_ioctl_add_key(struct file *filp, void __user *arg); int fscrypt_ioctl_remove_key(struct file *filp, void __user *arg); int fscrypt_ioctl_remove_key_all_users(struct file *filp, void __user *arg); int fscrypt_ioctl_get_key_status(struct file *filp, void __user *arg); /* keysetup.c */ int fscrypt_prepare_new_inode(struct inode *dir, struct inode *inode, bool *encrypt_ret); void fscrypt_put_encryption_info(struct inode *inode); void fscrypt_free_inode(struct inode *inode); int fscrypt_drop_inode(struct inode *inode); /* fname.c */ int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname, u8 *out, unsigned int olen); bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len, u32 max_len, u32 *encrypted_len_ret); int fscrypt_setup_filename(struct inode *inode, const struct qstr *iname, int lookup, struct fscrypt_name *fname); static inline void fscrypt_free_filename(struct fscrypt_name *fname) { kfree(fname->crypto_buf.name); } int fscrypt_fname_alloc_buffer(u32 max_encrypted_len, struct fscrypt_str *crypto_str); void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str); int fscrypt_fname_disk_to_usr(const struct inode *inode, u32 hash, u32 minor_hash, const struct fscrypt_str *iname, struct fscrypt_str *oname); bool fscrypt_match_name(const struct fscrypt_name *fname, const u8 *de_name, u32 de_name_len); u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name); int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags); /* bio.c */ bool fscrypt_decrypt_bio(struct bio *bio); int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, sector_t pblk, unsigned int len); /* hooks.c */ int fscrypt_file_open(struct inode *inode, struct file *filp); int __fscrypt_prepare_link(struct inode *inode, struct inode *dir, struct dentry *dentry); int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags); int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry, struct fscrypt_name *fname); int fscrypt_prepare_lookup_partial(struct inode *dir, struct dentry *dentry); int __fscrypt_prepare_readdir(struct inode *dir); int __fscrypt_prepare_setattr(struct dentry *dentry, struct iattr *attr); int fscrypt_prepare_setflags(struct inode *inode, unsigned int oldflags, unsigned int flags); int fscrypt_prepare_symlink(struct inode *dir, const char *target, unsigned int len, unsigned int max_len, struct fscrypt_str *disk_link); int __fscrypt_encrypt_symlink(struct inode *inode, const char *target, unsigned int len, struct fscrypt_str *disk_link); const char *fscrypt_get_symlink(struct inode *inode, const void *caddr, unsigned int max_size, struct delayed_call *done); int fscrypt_symlink_getattr(const struct path *path, struct kstat *stat); static inline void fscrypt_set_ops(struct super_block *sb, const struct fscrypt_operations *s_cop) { sb->s_cop = s_cop; } #else /* !CONFIG_FS_ENCRYPTION */ static inline struct fscrypt_info *fscrypt_get_info(const struct inode *inode) { return NULL; } static inline bool fscrypt_needs_contents_encryption(const struct inode *inode) { return false; } static inline void fscrypt_handle_d_move(struct dentry *dentry) { } static inline bool fscrypt_is_nokey_name(const struct dentry *dentry) { return false; } /* crypto.c */ static inline void fscrypt_enqueue_decrypt_work(struct work_struct *work) { } static inline struct page *fscrypt_encrypt_pagecache_blocks(struct page *page, unsigned int len, unsigned int offs, gfp_t gfp_flags) { return ERR_PTR(-EOPNOTSUPP); } static inline int fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page, unsigned int len, unsigned int offs, u64 lblk_num, gfp_t gfp_flags) { return -EOPNOTSUPP; } static inline int fscrypt_decrypt_pagecache_blocks(struct folio *folio, size_t len, size_t offs) { return -EOPNOTSUPP; } static inline int fscrypt_decrypt_block_inplace(const struct inode *inode, struct page *page, unsigned int len, unsigned int offs, u64 lblk_num) { return -EOPNOTSUPP; } static inline bool fscrypt_is_bounce_page(struct page *page) { return false; } static inline struct page *fscrypt_pagecache_page(struct page *bounce_page) { WARN_ON_ONCE(1); return ERR_PTR(-EINVAL); } static inline void fscrypt_free_bounce_page(struct page *bounce_page) { } /* policy.c */ static inline int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg) { return -EOPNOTSUPP; } static inline int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg) { return -EOPNOTSUPP; } static inline int fscrypt_ioctl_get_policy_ex(struct file *filp, void __user *arg) { return -EOPNOTSUPP; } static inline int fscrypt_ioctl_get_nonce(struct file *filp, void __user *arg) { return -EOPNOTSUPP; } static inline int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) { return 0; } static inline int fscrypt_set_context(struct inode *inode, void *fs_data) { return -EOPNOTSUPP; } struct fscrypt_dummy_policy { }; static inline int fscrypt_parse_test_dummy_encryption(const struct fs_parameter *param, struct fscrypt_dummy_policy *dummy_policy) { return -EINVAL; } static inline bool fscrypt_dummy_policies_equal(const struct fscrypt_dummy_policy *p1, const struct fscrypt_dummy_policy *p2) { return true; } static inline void fscrypt_show_test_dummy_encryption(struct seq_file *seq, char sep, struct super_block *sb) { } static inline bool fscrypt_is_dummy_policy_set(const struct fscrypt_dummy_policy *dummy_policy) { return false; } static inline void fscrypt_free_dummy_policy(struct fscrypt_dummy_policy *dummy_policy) { } /* keyring.c */ static inline void fscrypt_destroy_keyring(struct super_block *sb) { } static inline int fscrypt_ioctl_add_key(struct file *filp, void __user *arg) { return -EOPNOTSUPP; } static inline int fscrypt_ioctl_remove_key(struct file *filp, void __user *arg) { return -EOPNOTSUPP; } static inline int fscrypt_ioctl_remove_key_all_users(struct file *filp, void __user *arg) { return -EOPNOTSUPP; } static inline int fscrypt_ioctl_get_key_status(struct file *filp, void __user *arg) { return -EOPNOTSUPP; } /* keysetup.c */ static inline int fscrypt_prepare_new_inode(struct inode *dir, struct inode *inode, bool *encrypt_ret) { if (IS_ENCRYPTED(dir)) return -EOPNOTSUPP; return 0; } static inline void fscrypt_put_encryption_info(struct inode *inode) { return; } static inline void fscrypt_free_inode(struct inode *inode) { } static inline int fscrypt_drop_inode(struct inode *inode) { return 0; } /* fname.c */ static inline int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, int lookup, struct fscrypt_name *fname) { if (IS_ENCRYPTED(dir)) return -EOPNOTSUPP; memset(fname, 0, sizeof(*fname)); fname->usr_fname = iname; fname->disk_name.name = (unsigned char *)iname->name; fname->disk_name.len = iname->len; return 0; } static inline void fscrypt_free_filename(struct fscrypt_name *fname) { return; } static inline int fscrypt_fname_alloc_buffer(u32 max_encrypted_len, struct fscrypt_str *crypto_str) { return -EOPNOTSUPP; } static inline void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str) { return; } static inline int fscrypt_fname_disk_to_usr(const struct inode *inode, u32 hash, u32 minor_hash, const struct fscrypt_str *iname, struct fscrypt_str *oname) { return -EOPNOTSUPP; } static inline bool fscrypt_match_name(const struct fscrypt_name *fname, const u8 *de_name, u32 de_name_len) { /* Encryption support disabled; use standard comparison */ if (de_name_len != fname->disk_name.len) return false; return !memcmp(de_name, fname->disk_name.name, fname->disk_name.len); } static inline u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name) { WARN_ON_ONCE(1); return 0; } static inline int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) { return 1; } /* bio.c */ static inline bool fscrypt_decrypt_bio(struct bio *bio) { return true; } static inline int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, sector_t pblk, unsigned int len) { return -EOPNOTSUPP; } /* hooks.c */ static inline int fscrypt_file_open(struct inode *inode, struct file *filp) { if (IS_ENCRYPTED(inode)) return -EOPNOTSUPP; return 0; } static inline int __fscrypt_prepare_link(struct inode *inode, struct inode *dir, struct dentry *dentry) { return -EOPNOTSUPP; } static inline int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { return -EOPNOTSUPP; } static inline int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry, struct fscrypt_name *fname) { return -EOPNOTSUPP; } static inline int fscrypt_prepare_lookup_partial(struct inode *dir, struct dentry *dentry) { return -EOPNOTSUPP; } static inline int __fscrypt_prepare_readdir(struct inode *dir) { return -EOPNOTSUPP; } static inline int __fscrypt_prepare_setattr(struct dentry *dentry, struct iattr *attr) { return -EOPNOTSUPP; } static inline int fscrypt_prepare_setflags(struct inode *inode, unsigned int oldflags, unsigned int flags) { return 0; } static inline int fscrypt_prepare_symlink(struct inode *dir, const char *target, unsigned int len, unsigned int max_len, struct fscrypt_str *disk_link) { if (IS_ENCRYPTED(dir)) return -EOPNOTSUPP; disk_link->name = (unsigned char *)target; disk_link->len = len + 1; if (disk_link->len > max_len) return -ENAMETOOLONG; return 0; } static inline int __fscrypt_encrypt_symlink(struct inode *inode, const char *target, unsigned int len, struct fscrypt_str *disk_link) { return -EOPNOTSUPP; } static inline const char *fscrypt_get_symlink(struct inode *inode, const void *caddr, unsigned int max_size, struct delayed_call *done) { return ERR_PTR(-EOPNOTSUPP); } static inline int fscrypt_symlink_getattr(const struct path *path, struct kstat *stat) { return -EOPNOTSUPP; } static inline void fscrypt_set_ops(struct super_block *sb, const struct fscrypt_operations *s_cop) { } #endif /* !CONFIG_FS_ENCRYPTION */ /* inline_crypt.c */ #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT bool __fscrypt_inode_uses_inline_crypto(const struct inode *inode); void fscrypt_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode, u64 first_lblk, gfp_t gfp_mask); void fscrypt_set_bio_crypt_ctx_bh(struct bio *bio, const struct buffer_head *first_bh, gfp_t gfp_mask); bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode, u64 next_lblk); bool fscrypt_mergeable_bio_bh(struct bio *bio, const struct buffer_head *next_bh); bool fscrypt_dio_supported(struct inode *inode); u64 fscrypt_limit_io_blocks(const struct inode *inode, u64 lblk, u64 nr_blocks); #else /* CONFIG_FS_ENCRYPTION_INLINE_CRYPT */ static inline bool __fscrypt_inode_uses_inline_crypto(const struct inode *inode) { return false; } static inline void fscrypt_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode, u64 first_lblk, gfp_t gfp_mask) { } static inline void fscrypt_set_bio_crypt_ctx_bh( struct bio *bio, const struct buffer_head *first_bh, gfp_t gfp_mask) { } static inline bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode, u64 next_lblk) { return true; } static inline bool fscrypt_mergeable_bio_bh(struct bio *bio, const struct buffer_head *next_bh) { return true; } static inline bool fscrypt_dio_supported(struct inode *inode) { return !fscrypt_needs_contents_encryption(inode); } static inline u64 fscrypt_limit_io_blocks(const struct inode *inode, u64 lblk, u64 nr_blocks) { return nr_blocks; } #endif /* !CONFIG_FS_ENCRYPTION_INLINE_CRYPT */ #if IS_ENABLED(CONFIG_FS_ENCRYPTION) && IS_ENABLED(CONFIG_DM_DEFAULT_KEY) static inline bool fscrypt_inode_should_skip_dm_default_key(const struct inode *inode) { return IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode); } #else static inline bool fscrypt_inode_should_skip_dm_default_key(const struct inode *inode) { return false; } #endif /** * fscrypt_inode_uses_inline_crypto() - test whether an inode uses inline * encryption * @inode: an inode. If encrypted, its key must be set up. * * Return: true if the inode requires file contents encryption and if the * encryption should be done in the block layer via blk-crypto rather * than in the filesystem layer. */ static inline bool fscrypt_inode_uses_inline_crypto(const struct inode *inode) { return fscrypt_needs_contents_encryption(inode) && __fscrypt_inode_uses_inline_crypto(inode); } /** * fscrypt_inode_uses_fs_layer_crypto() - test whether an inode uses fs-layer * encryption * @inode: an inode. If encrypted, its key must be set up. * * Return: true if the inode requires file contents encryption and if the * encryption should be done in the filesystem layer rather than in the * block layer via blk-crypto. */ static inline bool fscrypt_inode_uses_fs_layer_crypto(const struct inode *inode) { return fscrypt_needs_contents_encryption(inode) && !__fscrypt_inode_uses_inline_crypto(inode); } /** * fscrypt_has_encryption_key() - check whether an inode has had its key set up * @inode: the inode to check * * Return: %true if the inode has had its encryption key set up, else %false. * * Usually this should be preceded by fscrypt_get_encryption_info() to try to * set up the key first. */ static inline bool fscrypt_has_encryption_key(const struct inode *inode) { return fscrypt_get_info(inode) != NULL; } /** * fscrypt_prepare_link() - prepare to link an inode into a possibly-encrypted * directory * @old_dentry: an existing dentry for the inode being linked * @dir: the target directory * @dentry: negative dentry for the target filename * * A new link can only be added to an encrypted directory if the directory's * encryption key is available --- since otherwise we'd have no way to encrypt * the filename. * * We also verify that the link will not violate the constraint that all files * in an encrypted directory tree use the same encryption policy. * * Return: 0 on success, -ENOKEY if the directory's encryption key is missing, * -EXDEV if the link would result in an inconsistent encryption policy, or * another -errno code. */ static inline int fscrypt_prepare_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { if (IS_ENCRYPTED(dir)) return __fscrypt_prepare_link(d_inode(old_dentry), dir, dentry); return 0; } /** * fscrypt_prepare_rename() - prepare for a rename between possibly-encrypted * directories * @old_dir: source directory * @old_dentry: dentry for source file * @new_dir: target directory * @new_dentry: dentry for target location (may be negative unless exchanging) * @flags: rename flags (we care at least about %RENAME_EXCHANGE) * * Prepare for ->rename() where the source and/or target directories may be * encrypted. A new link can only be added to an encrypted directory if the * directory's encryption key is available --- since otherwise we'd have no way * to encrypt the filename. A rename to an existing name, on the other hand, * *is* cryptographically possible without the key. However, we take the more * conservative approach and just forbid all no-key renames. * * We also verify that the rename will not violate the constraint that all files * in an encrypted directory tree use the same encryption policy. * * Return: 0 on success, -ENOKEY if an encryption key is missing, -EXDEV if the * rename would cause inconsistent encryption policies, or another -errno code. */ static inline int fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { if (IS_ENCRYPTED(old_dir) || IS_ENCRYPTED(new_dir)) return __fscrypt_prepare_rename(old_dir, old_dentry, new_dir, new_dentry, flags); return 0; } /** * fscrypt_prepare_lookup() - prepare to lookup a name in a possibly-encrypted * directory * @dir: directory being searched * @dentry: filename being looked up * @fname: (output) the name to use to search the on-disk directory * * Prepare for ->lookup() in a directory which may be encrypted by determining * the name that will actually be used to search the directory on-disk. If the * directory's encryption policy is supported by this kernel and its encryption * key is available, then the lookup is assumed to be by plaintext name; * otherwise, it is assumed to be by no-key name. * * This will set DCACHE_NOKEY_NAME on the dentry if the lookup is by no-key * name. In this case the filesystem must assign the dentry a dentry_operations * which contains fscrypt_d_revalidate (or contains a d_revalidate method that * calls fscrypt_d_revalidate), so that the dentry will be invalidated if the * directory's encryption key is later added. * * Return: 0 on success; -ENOENT if the directory's key is unavailable but the * filename isn't a valid no-key name, so a negative dentry should be created; * or another -errno code. */ static inline int fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry, struct fscrypt_name *fname) { if (IS_ENCRYPTED(dir)) return __fscrypt_prepare_lookup(dir, dentry, fname); memset(fname, 0, sizeof(*fname)); fname->usr_fname = &dentry->d_name; fname->disk_name.name = (unsigned char *)dentry->d_name.name; fname->disk_name.len = dentry->d_name.len; return 0; } /** * fscrypt_prepare_readdir() - prepare to read a possibly-encrypted directory * @dir: the directory inode * * If the directory is encrypted and it doesn't already have its encryption key * set up, try to set it up so that the filenames will be listed in plaintext * form rather than in no-key form. * * Return: 0 on success; -errno on error. Note that the encryption key being * unavailable is not considered an error. It is also not an error if * the encryption policy is unsupported by this kernel; that is treated * like the key being unavailable, so that files can still be deleted. */ static inline int fscrypt_prepare_readdir(struct inode *dir) { if (IS_ENCRYPTED(dir)) return __fscrypt_prepare_readdir(dir); return 0; } /** * fscrypt_prepare_setattr() - prepare to change a possibly-encrypted inode's * attributes * @dentry: dentry through which the inode is being changed * @attr: attributes to change * * Prepare for ->setattr() on a possibly-encrypted inode. On an encrypted file, * most attribute changes are allowed even without the encryption key. However, * without the encryption key we do have to forbid truncates. This is needed * because the size being truncated to may not be a multiple of the filesystem * block size, and in that case we'd have to decrypt the final block, zero the * portion past i_size, and re-encrypt it. (We *could* allow truncating to a * filesystem block boundary, but it's simpler to just forbid all truncates --- * and we already forbid all other contents modifications without the key.) * * Return: 0 on success, -ENOKEY if the key is missing, or another -errno code * if a problem occurred while setting up the encryption key. */ static inline int fscrypt_prepare_setattr(struct dentry *dentry, struct iattr *attr) { if (IS_ENCRYPTED(d_inode(dentry))) return __fscrypt_prepare_setattr(dentry, attr); return 0; } /** * fscrypt_encrypt_symlink() - encrypt the symlink target if needed * @inode: symlink inode * @target: plaintext symlink target * @len: length of @target excluding null terminator * @disk_link: (in/out) the on-disk symlink target being prepared * * If the symlink target needs to be encrypted, then this function encrypts it * into @disk_link->name. fscrypt_prepare_symlink() must have been called * previously to compute @disk_link->len. If the filesystem did not allocate a * buffer for @disk_link->name after calling fscrypt_prepare_link(), then one * will be kmalloc()'ed and the filesystem will be responsible for freeing it. * * Return: 0 on success, -errno on failure */ static inline int fscrypt_encrypt_symlink(struct inode *inode, const char *target, unsigned int len, struct fscrypt_str *disk_link) { if (IS_ENCRYPTED(inode)) return __fscrypt_encrypt_symlink(inode, target, len, disk_link); return 0; } /* If *pagep is a bounce page, free it and set *pagep to the pagecache page */ static inline void fscrypt_finalize_bounce_page(struct page **pagep) { struct page *page = *pagep; if (fscrypt_is_bounce_page(page)) { *pagep = fscrypt_pagecache_page(page); fscrypt_free_bounce_page(page); } } #endif /* _LINUX_FSCRYPT_H */ |
628 186 572 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _TRACE_SYSCALL_H #define _TRACE_SYSCALL_H #include <linux/tracepoint.h> #include <linux/unistd.h> #include <linux/trace_events.h> #include <linux/thread_info.h> #include <asm/ptrace.h> /* * A syscall entry in the ftrace syscalls array. * * @name: name of the syscall * @syscall_nr: number of the syscall * @nb_args: number of parameters it takes * @types: list of types as strings * @args: list of args as strings (args[i] matches types[i]) * @enter_fields: list of fields for syscall_enter trace event * @enter_event: associated syscall_enter trace event * @exit_event: associated syscall_exit trace event */ struct syscall_metadata { const char *name; int syscall_nr; int nb_args; const char **types; const char **args; struct list_head enter_fields; struct trace_event_call *enter_event; struct trace_event_call *exit_event; }; #if defined(CONFIG_TRACEPOINTS) && defined(CONFIG_HAVE_SYSCALL_TRACEPOINTS) static inline void syscall_tracepoint_update(struct task_struct *p) { if (test_syscall_work(SYSCALL_TRACEPOINT)) set_task_syscall_work(p, SYSCALL_TRACEPOINT); else clear_task_syscall_work(p, SYSCALL_TRACEPOINT); } #else static inline void syscall_tracepoint_update(struct task_struct *p) { } #endif #endif /* _TRACE_SYSCALL_H */ |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2010 Daniel Mack <daniel@caiaq.de> * * This file holds USB constants and structures defined * by the USB Device Class Definition for Audio Devices in version 2.0. * Comments below reference relevant sections of the documents contained * in http://www.usb.org/developers/devclass_docs/Audio2.0_final.zip */ #ifndef __LINUX_USB_AUDIO_V2_H #define __LINUX_USB_AUDIO_V2_H #include <linux/types.h> /* v1.0 and v2.0 of this standard have many things in common. For the rest * of the definitions, please refer to audio.h */ /* * bmControl field decoders * * From the USB Audio spec v2.0: * * bmaControls() is a (ch+1)-element array of 4-byte bitmaps, * each containing a set of bit pairs. If a Control is present, * it must be Host readable. If a certain Control is not * present then the bit pair must be set to 0b00. * If a Control is present but read-only, the bit pair must be * set to 0b01. If a Control is also Host programmable, the bit * pair must be set to 0b11. The value 0b10 is not allowed. * */ static inline bool uac_v2v3_control_is_readable(u32 bmControls, u8 control) { return (bmControls >> ((control - 1) * 2)) & 0x1; } static inline bool uac_v2v3_control_is_writeable(u32 bmControls, u8 control) { return (bmControls >> ((control - 1) * 2)) & 0x2; } /* 4.7.2 Class-Specific AC Interface Descriptor */ struct uac2_ac_header_descriptor { __u8 bLength; /* 9 */ __u8 bDescriptorType; /* USB_DT_CS_INTERFACE */ __u8 bDescriptorSubtype; /* UAC_MS_HEADER */ __le16 bcdADC; /* 0x0200 */ __u8 bCategory; __le16 wTotalLength; /* includes Unit and Terminal desc. */ __u8 bmControls; } __packed; /* 2.3.1.6 Type I Format Type Descriptor (Frmts20 final.pdf)*/ struct uac2_format_type_i_descriptor { __u8 bLength; /* in bytes: 6 */ __u8 bDescriptorType; /* USB_DT_CS_INTERFACE */ __u8 bDescriptorSubtype; /* FORMAT_TYPE */ __u8 bFormatType; /* FORMAT_TYPE_1 */ __u8 bSubslotSize; /* {1,2,3,4} */ __u8 bBitResolution; } __packed; /* 4.7.2.1 Clock Source Descriptor */ struct uac_clock_source_descriptor { __u8 bLength; __u8 bDescriptorType; __u8 bDescriptorSubtype; __u8 bClockID; __u8 bmAttributes; __u8 bmControls; __u8 bAssocTerminal; __u8 iClockSource; } __attribute__((packed)); /* bmAttribute fields */ #define UAC_CLOCK_SOURCE_TYPE_EXT 0x0 #define UAC_CLOCK_SOURCE_TYPE_INT_FIXED 0x1 #define UAC_CLOCK_SOURCE_TYPE_INT_VAR 0x2 #define UAC_CLOCK_SOURCE_TYPE_INT_PROG 0x3 #define UAC_CLOCK_SOURCE_SYNCED_TO_SOF (1 << 2) /* 4.7.2.2 Clock Source Descriptor */ struct uac_clock_selector_descriptor { __u8 bLength; __u8 bDescriptorType; __u8 bDescriptorSubtype; __u8 bClockID; __u8 bNrInPins; __u8 baCSourceID[]; /* bmControls and iClockSource omitted */ } __attribute__((packed)); /* 4.7.2.3 Clock Multiplier Descriptor */ struct uac_clock_multiplier_descriptor { __u8 bLength; __u8 bDescriptorType; __u8 bDescriptorSubtype; __u8 bClockID; __u8 bCSourceID; __u8 bmControls; __u8 iClockMultiplier; } __attribute__((packed)); /* 4.7.2.4 Input terminal descriptor */ struct uac2_input_terminal_descriptor { __u8 bLength; __u8 bDescriptorType; __u8 bDescriptorSubtype; __u8 bTerminalID; __le16 wTerminalType; __u8 bAssocTerminal; __u8 bCSourceID; __u8 bNrChannels; __le32 bmChannelConfig; __u8 iChannelNames; __le16 bmControls; __u8 iTerminal; } __attribute__((packed)); /* 4.7.2.5 Output terminal descriptor */ struct uac2_output_terminal_descriptor { __u8 bLength; __u8 bDescriptorType; __u8 bDescriptorSubtype; __u8 bTerminalID; __le16 wTerminalType; __u8 bAssocTerminal; __u8 bSourceID; __u8 bCSourceID; __le16 bmControls; __u8 iTerminal; } __attribute__((packed)); /* 4.7.2.8 Feature Unit Descriptor */ struct uac2_feature_unit_descriptor { __u8 bLength; __u8 bDescriptorType; __u8 bDescriptorSubtype; __u8 bUnitID; __u8 bSourceID; /* bmaControls is actually u32, * but u8 is needed for the hybrid parser */ __u8 bmaControls[]; /* variable length */ } __attribute__((packed)); #define UAC2_DT_FEATURE_UNIT_SIZE(ch) (6 + ((ch) + 1) * 4) /* As above, but more useful for defining your own descriptors: */ #define DECLARE_UAC2_FEATURE_UNIT_DESCRIPTOR(ch) \ struct uac2_feature_unit_descriptor_##ch { \ __u8 bLength; \ __u8 bDescriptorType; \ __u8 bDescriptorSubtype; \ __u8 bUnitID; \ __u8 bSourceID; \ __le32 bmaControls[ch + 1]; \ __u8 iFeature; \ } __packed /* 4.7.2.10 Effect Unit Descriptor */ struct uac2_effect_unit_descriptor { __u8 bLength; __u8 bDescriptorType; __u8 bDescriptorSubtype; __u8 bUnitID; __le16 wEffectType; __u8 bSourceID; __u8 bmaControls[]; /* variable length */ } __attribute__((packed)); /* 4.9.2 Class-Specific AS Interface Descriptor */ struct uac2_as_header_descriptor { __u8 bLength; __u8 bDescriptorType; __u8 bDescriptorSubtype; __u8 bTerminalLink; __u8 bmControls; __u8 bFormatType; __le32 bmFormats; __u8 bNrChannels; __le32 bmChannelConfig; __u8 iChannelNames; } __attribute__((packed)); #define UAC2_FORMAT_TYPE_I_RAW_DATA (1 << 31) /* 4.10.1.2 Class-Specific AS Isochronous Audio Data Endpoint Descriptor */ struct uac2_iso_endpoint_descriptor { __u8 bLength; /* in bytes: 8 */ __u8 bDescriptorType; /* USB_DT_CS_ENDPOINT */ __u8 bDescriptorSubtype; /* EP_GENERAL */ __u8 bmAttributes; __u8 bmControls; __u8 bLockDelayUnits; __le16 wLockDelay; } __attribute__((packed)); #define UAC2_CONTROL_PITCH (3 << 0) #define UAC2_CONTROL_DATA_OVERRUN (3 << 2) #define UAC2_CONTROL_DATA_UNDERRUN (3 << 4) /* 5.2.5.4.2 Connector Control Parameter Block */ struct uac2_connectors_ctl_blk { __u8 bNrChannels; __le32 bmChannelConfig; __u8 iChannelNames; } __attribute__((packed)); /* 6.1 Interrupt Data Message */ #define UAC2_INTERRUPT_DATA_MSG_VENDOR (1 << 0) #define UAC2_INTERRUPT_DATA_MSG_EP (1 << 1) struct uac2_interrupt_data_msg { __u8 bInfo; __u8 bAttribute; __le16 wValue; __le16 wIndex; } __attribute__((packed)); /* A.7 Audio Function Category Codes */ #define UAC2_FUNCTION_SUBCLASS_UNDEFINED 0x00 #define UAC2_FUNCTION_DESKTOP_SPEAKER 0x01 #define UAC2_FUNCTION_HOME_THEATER 0x02 #define UAC2_FUNCTION_MICROPHONE 0x03 #define UAC2_FUNCTION_HEADSET 0x04 #define UAC2_FUNCTION_TELEPHONE 0x05 #define UAC2_FUNCTION_CONVERTER 0x06 #define UAC2_FUNCTION_SOUND_RECORDER 0x07 #define UAC2_FUNCTION_IO_BOX 0x08 #define UAC2_FUNCTION_MUSICAL_INSTRUMENT 0x09 #define UAC2_FUNCTION_PRO_AUDIO 0x0a #define UAC2_FUNCTION_AUDIO_VIDEO 0x0b #define UAC2_FUNCTION_CONTROL_PANEL 0x0c #define UAC2_FUNCTION_OTHER 0xff /* A.9 Audio Class-Specific AC Interface Descriptor Subtypes */ /* see audio.h for the rest, which is identical to v1 */ #define UAC2_EFFECT_UNIT 0x07 #define UAC2_PROCESSING_UNIT_V2 0x08 #define UAC2_EXTENSION_UNIT_V2 0x09 #define UAC2_CLOCK_SOURCE 0x0a #define UAC2_CLOCK_SELECTOR 0x0b #define UAC2_CLOCK_MULTIPLIER 0x0c #define UAC2_SAMPLE_RATE_CONVERTER 0x0d /* A.10 Audio Class-Specific AS Interface Descriptor Subtypes */ /* see audio.h for the rest, which is identical to v1 */ #define UAC2_ENCODER 0x03 #define UAC2_DECODER 0x04 /* A.11 Effect Unit Effect Types */ #define UAC2_EFFECT_UNDEFINED 0x00 #define UAC2_EFFECT_PARAM_EQ 0x01 #define UAC2_EFFECT_REVERB 0x02 #define UAC2_EFFECT_MOD_DELAY 0x03 #define UAC2_EFFECT_DYN_RANGE_COMP 0x04 /* A.12 Processing Unit Process Types */ #define UAC2_PROCESS_UNDEFINED 0x00 #define UAC2_PROCESS_UP_DOWNMIX 0x01 #define UAC2_PROCESS_DOLBY_PROLOCIC 0x02 #define UAC2_PROCESS_STEREO_EXTENDER 0x03 /* A.14 Audio Class-Specific Request Codes */ #define UAC2_CS_CUR 0x01 #define UAC2_CS_RANGE 0x02 #define UAC2_CS_MEM 0x03 /* A.15 Encoder Type Codes */ #define UAC2_ENCODER_UNDEFINED 0x00 #define UAC2_ENCODER_OTHER 0x01 #define UAC2_ENCODER_MPEG 0x02 #define UAC2_ENCODER_AC3 0x03 #define UAC2_ENCODER_WMA 0x04 #define UAC2_ENCODER_DTS 0x05 /* A.16 Decoder Type Codes */ #define UAC2_DECODER_UNDEFINED 0x00 #define UAC2_DECODER_OTHER 0x01 #define UAC2_DECODER_MPEG 0x02 #define UAC2_DECODER_AC3 0x03 #define UAC2_DECODER_WMA 0x04 #define UAC2_DECODER_DTS 0x05 /* A.17.1 Clock Source Control Selectors */ #define UAC2_CS_UNDEFINED 0x00 #define UAC2_CS_CONTROL_SAM_FREQ 0x01 #define UAC2_CS_CONTROL_CLOCK_VALID 0x02 /* A.17.2 Clock Selector Control Selectors */ #define UAC2_CX_UNDEFINED 0x00 #define UAC2_CX_CLOCK_SELECTOR 0x01 /* A.17.3 Clock Multiplier Control Selectors */ #define UAC2_CM_UNDEFINED 0x00 #define UAC2_CM_NUMERATOR 0x01 #define UAC2_CM_DENOMINTATOR 0x02 /* A.17.4 Terminal Control Selectors */ #define UAC2_TE_UNDEFINED 0x00 #define UAC2_TE_COPY_PROTECT 0x01 #define UAC2_TE_CONNECTOR 0x02 #define UAC2_TE_OVERLOAD 0x03 #define UAC2_TE_CLUSTER 0x04 #define UAC2_TE_UNDERFLOW 0x05 #define UAC2_TE_OVERFLOW 0x06 #define UAC2_TE_LATENCY 0x07 /* A.17.5 Mixer Control Selectors */ #define UAC2_MU_UNDEFINED 0x00 #define UAC2_MU_MIXER 0x01 #define UAC2_MU_CLUSTER 0x02 #define UAC2_MU_UNDERFLOW 0x03 #define UAC2_MU_OVERFLOW 0x04 #define UAC2_MU_LATENCY 0x05 /* A.17.6 Selector Control Selectors */ #define UAC2_SU_UNDEFINED 0x00 #define UAC2_SU_SELECTOR 0x01 #define UAC2_SU_LATENCY 0x02 /* A.17.7 Feature Unit Control Selectors */ /* see audio.h for the rest, which is identical to v1 */ #define UAC2_FU_INPUT_GAIN 0x0b #define UAC2_FU_INPUT_GAIN_PAD 0x0c #define UAC2_FU_PHASE_INVERTER 0x0d #define UAC2_FU_UNDERFLOW 0x0e #define UAC2_FU_OVERFLOW 0x0f #define UAC2_FU_LATENCY 0x10 /* A.17.8.1 Parametric Equalizer Section Effect Unit Control Selectors */ #define UAC2_PE_UNDEFINED 0x00 #define UAC2_PE_ENABLE 0x01 #define UAC2_PE_CENTERFREQ 0x02 #define UAC2_PE_QFACTOR 0x03 #define UAC2_PE_GAIN 0x04 #define UAC2_PE_UNDERFLOW 0x05 #define UAC2_PE_OVERFLOW 0x06 #define UAC2_PE_LATENCY 0x07 /* A.17.8.2 Reverberation Effect Unit Control Selectors */ #define UAC2_RV_UNDEFINED 0x00 #define UAC2_RV_ENABLE 0x01 #define UAC2_RV_TYPE 0x02 #define UAC2_RV_LEVEL 0x03 #define UAC2_RV_TIME 0x04 #define UAC2_RV_FEEDBACK 0x05 #define UAC2_RV_PREDELAY 0x06 #define UAC2_RV_DENSITY 0x07 #define UAC2_RV_HIFREQ_ROLLOFF 0x08 #define UAC2_RV_UNDERFLOW 0x09 #define UAC2_RV_OVERFLOW 0x0a #define UAC2_RV_LATENCY 0x0b /* A.17.8.3 Modulation Delay Effect Control Selectors */ #define UAC2_MD_UNDEFINED 0x00 #define UAC2_MD_ENABLE 0x01 #define UAC2_MD_BALANCE 0x02 #define UAC2_MD_RATE 0x03 #define UAC2_MD_DEPTH 0x04 #define UAC2_MD_TIME 0x05 #define UAC2_MD_FEEDBACK 0x06 #define UAC2_MD_UNDERFLOW 0x07 #define UAC2_MD_OVERFLOW 0x08 #define UAC2_MD_LATENCY 0x09 /* A.17.8.4 Dynamic Range Compressor Effect Unit Control Selectors */ #define UAC2_DR_UNDEFINED 0x00 #define UAC2_DR_ENABLE 0x01 #define UAC2_DR_COMPRESSION_RATE 0x02 #define UAC2_DR_MAXAMPL 0x03 #define UAC2_DR_THRESHOLD 0x04 #define UAC2_DR_ATTACK_TIME 0x05 #define UAC2_DR_RELEASE_TIME 0x06 #define UAC2_DR_UNDEFLOW 0x07 #define UAC2_DR_OVERFLOW 0x08 #define UAC2_DR_LATENCY 0x09 /* A.17.9.1 Up/Down-mix Processing Unit Control Selectors */ #define UAC2_UD_UNDEFINED 0x00 #define UAC2_UD_ENABLE 0x01 #define UAC2_UD_MODE_SELECT 0x02 #define UAC2_UD_CLUSTER 0x03 #define UAC2_UD_UNDERFLOW 0x04 #define UAC2_UD_OVERFLOW 0x05 #define UAC2_UD_LATENCY 0x06 /* A.17.9.2 Dolby Prologic[tm] Processing Unit Control Selectors */ #define UAC2_DP_UNDEFINED 0x00 #define UAC2_DP_ENABLE 0x01 #define UAC2_DP_MODE_SELECT 0x02 #define UAC2_DP_CLUSTER 0x03 #define UAC2_DP_UNDERFFLOW 0x04 #define UAC2_DP_OVERFLOW 0x05 #define UAC2_DP_LATENCY 0x06 /* A.17.9.3 Stereo Expander Processing Unit Control Selectors */ #define UAC2_ST_EXT_UNDEFINED 0x00 #define UAC2_ST_EXT_ENABLE 0x01 #define UAC2_ST_EXT_WIDTH 0x02 #define UAC2_ST_EXT_UNDEFLOW 0x03 #define UAC2_ST_EXT_OVERFLOW 0x04 #define UAC2_ST_EXT_LATENCY 0x05 /* A.17.10 Extension Unit Control Selectors */ #define UAC2_XU_UNDEFINED 0x00 #define UAC2_XU_ENABLE 0x01 #define UAC2_XU_CLUSTER 0x02 #define UAC2_XU_UNDERFLOW 0x03 #define UAC2_XU_OVERFLOW 0x04 #define UAC2_XU_LATENCY 0x05 /* A.17.11 AudioStreaming Interface Control Selectors */ #define UAC2_AS_UNDEFINED 0x00 #define UAC2_AS_ACT_ALT_SETTING 0x01 #define UAC2_AS_VAL_ALT_SETTINGS 0x02 #define UAC2_AS_AUDIO_DATA_FORMAT 0x03 /* A.17.12 Encoder Control Selectors */ #define UAC2_EN_UNDEFINED 0x00 #define UAC2_EN_BIT_RATE 0x01 #define UAC2_EN_QUALITY 0x02 #define UAC2_EN_VBR 0x03 #define UAC2_EN_TYPE 0x04 #define UAC2_EN_UNDERFLOW 0x05 #define UAC2_EN_OVERFLOW 0x06 #define UAC2_EN_ENCODER_ERROR 0x07 #define UAC2_EN_PARAM1 0x08 #define UAC2_EN_PARAM2 0x09 #define UAC2_EN_PARAM3 0x0a #define UAC2_EN_PARAM4 0x0b #define UAC2_EN_PARAM5 0x0c #define UAC2_EN_PARAM6 0x0d #define UAC2_EN_PARAM7 0x0e #define UAC2_EN_PARAM8 0x0f /* A.17.13.1 MPEG Decoder Control Selectors */ #define UAC2_MPEG_UNDEFINED 0x00 #define UAC2_MPEG_DUAL_CHANNEL 0x01 #define UAC2_MPEG_SECOND_STEREO 0x02 #define UAC2_MPEG_MULTILINGUAL 0x03 #define UAC2_MPEG_DYN_RANGE 0x04 #define UAC2_MPEG_SCALING 0x05 #define UAC2_MPEG_HILO_SCALING 0x06 #define UAC2_MPEG_UNDERFLOW 0x07 #define UAC2_MPEG_OVERFLOW 0x08 #define UAC2_MPEG_DECODER_ERROR 0x09 /* A17.13.2 AC3 Decoder Control Selectors */ #define UAC2_AC3_UNDEFINED 0x00 #define UAC2_AC3_MODE 0x01 #define UAC2_AC3_DYN_RANGE 0x02 #define UAC2_AC3_SCALING 0x03 #define UAC2_AC3_HILO_SCALING 0x04 #define UAC2_AC3_UNDERFLOW 0x05 #define UAC2_AC3_OVERFLOW 0x06 #define UAC2_AC3_DECODER_ERROR 0x07 /* A17.13.3 WMA Decoder Control Selectors */ #define UAC2_WMA_UNDEFINED 0x00 #define UAC2_WMA_UNDERFLOW 0x01 #define UAC2_WMA_OVERFLOW 0x02 #define UAC2_WMA_DECODER_ERROR 0x03 /* A17.13.4 DTS Decoder Control Selectors */ #define UAC2_DTS_UNDEFINED 0x00 #define UAC2_DTS_UNDERFLOW 0x01 #define UAC2_DTS_OVERFLOW 0x02 #define UAC2_DTS_DECODER_ERROR 0x03 /* A17.14 Endpoint Control Selectors */ #define UAC2_EP_CS_UNDEFINED 0x00 #define UAC2_EP_CS_PITCH 0x01 #define UAC2_EP_CS_DATA_OVERRUN 0x02 #define UAC2_EP_CS_DATA_UNDERRUN 0x03 #endif /* __LINUX_USB_AUDIO_V2_H */ |
8 2 3 4 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 | // SPDX-License-Identifier: GPL-2.0-or-later /* * lib/ts_fsm.c A naive finite state machine text search approach * * Authors: Thomas Graf <tgraf@suug.ch> * * ========================================================================== * * A finite state machine consists of n states (struct ts_fsm_token) * representing the pattern as a finite automaton. The data is read * sequentially on an octet basis. Every state token specifies the number * of recurrences and the type of value accepted which can be either a * specific character or ctype based set of characters. The available * type of recurrences include 1, (0|1), [0 n], and [1 n]. * * The algorithm differs between strict/non-strict mode specifying * whether the pattern has to start at the first octet. Strict mode * is enabled by default and can be disabled by inserting * TS_FSM_HEAD_IGNORE as the first token in the chain. * * The runtime performance of the algorithm should be around O(n), * however while in strict mode the average runtime can be better. */ #include <linux/module.h> #include <linux/types.h> #include <linux/string.h> #include <linux/ctype.h> #include <linux/textsearch.h> #include <linux/textsearch_fsm.h> struct ts_fsm { unsigned int ntokens; struct ts_fsm_token tokens[]; }; /* other values derived from ctype.h */ #define _A 0x100 /* ascii */ #define _W 0x200 /* wildcard */ /* Map to _ctype flags and some magic numbers */ static const u16 token_map[TS_FSM_TYPE_MAX+1] = { [TS_FSM_SPECIFIC] = 0, [TS_FSM_WILDCARD] = _W, [TS_FSM_CNTRL] = _C, [TS_FSM_LOWER] = _L, [TS_FSM_UPPER] = _U, [TS_FSM_PUNCT] = _P, [TS_FSM_SPACE] = _S, [TS_FSM_DIGIT] = _D, [TS_FSM_XDIGIT] = _D | _X, [TS_FSM_ALPHA] = _U | _L, [TS_FSM_ALNUM] = _U | _L | _D, [TS_FSM_PRINT] = _P | _U | _L | _D | _SP, [TS_FSM_GRAPH] = _P | _U | _L | _D, [TS_FSM_ASCII] = _A, }; static const u16 token_lookup_tbl[256] = { _W|_A|_C, _W|_A|_C, _W|_A|_C, _W|_A|_C, /* 0- 3 */ _W|_A|_C, _W|_A|_C, _W|_A|_C, _W|_A|_C, /* 4- 7 */ _W|_A|_C, _W|_A|_C|_S, _W|_A|_C|_S, _W|_A|_C|_S, /* 8- 11 */ _W|_A|_C|_S, _W|_A|_C|_S, _W|_A|_C, _W|_A|_C, /* 12- 15 */ _W|_A|_C, _W|_A|_C, _W|_A|_C, _W|_A|_C, /* 16- 19 */ _W|_A|_C, _W|_A|_C, _W|_A|_C, _W|_A|_C, /* 20- 23 */ _W|_A|_C, _W|_A|_C, _W|_A|_C, _W|_A|_C, /* 24- 27 */ _W|_A|_C, _W|_A|_C, _W|_A|_C, _W|_A|_C, /* 28- 31 */ _W|_A|_S|_SP, _W|_A|_P, _W|_A|_P, _W|_A|_P, /* 32- 35 */ _W|_A|_P, _W|_A|_P, _W|_A|_P, _W|_A|_P, /* 36- 39 */ _W|_A|_P, _W|_A|_P, _W|_A|_P, _W|_A|_P, /* 40- 43 */ _W|_A|_P, _W|_A|_P, _W|_A|_P, _W|_A|_P, /* 44- 47 */ _W|_A|_D, _W|_A|_D, _W|_A|_D, _W|_A|_D, /* 48- 51 */ _W|_A|_D, _W|_A|_D, _W|_A|_D, _W|_A|_D, /* 52- 55 */ _W|_A|_D, _W|_A|_D, _W|_A|_P, _W|_A|_P, /* 56- 59 */ _W|_A|_P, _W|_A|_P, _W|_A|_P, _W|_A|_P, /* 60- 63 */ _W|_A|_P, _W|_A|_U|_X, _W|_A|_U|_X, _W|_A|_U|_X, /* 64- 67 */ _W|_A|_U|_X, _W|_A|_U|_X, _W|_A|_U|_X, _W|_A|_U, /* 68- 71 */ _W|_A|_U, _W|_A|_U, _W|_A|_U, _W|_A|_U, /* 72- 75 */ _W|_A|_U, _W|_A|_U, _W|_A|_U, _W|_A|_U, /* 76- 79 */ _W|_A|_U, _W|_A|_U, _W|_A|_U, _W|_A|_U, /* 80- 83 */ _W|_A|_U, _W|_A|_U, _W|_A|_U, _W|_A|_U, /* 84- 87 */ _W|_A|_U, _W|_A|_U, _W|_A|_U, _W|_A|_P, /* 88- 91 */ _W|_A|_P, _W|_A|_P, _W|_A|_P, _W|_A|_P, /* 92- 95 */ _W|_A|_P, _W|_A|_L|_X, _W|_A|_L|_X, _W|_A|_L|_X, /* 96- 99 */ _W|_A|_L|_X, _W|_A|_L|_X, _W|_A|_L|_X, _W|_A|_L, /* 100-103 */ _W|_A|_L, _W|_A|_L, _W|_A|_L, _W|_A|_L, /* 104-107 */ _W|_A|_L, _W|_A|_L, _W|_A|_L, _W|_A|_L, /* 108-111 */ _W|_A|_L, _W|_A|_L, _W|_A|_L, _W|_A|_L, /* 112-115 */ _W|_A|_L, _W|_A|_L, _W|_A|_L, _W|_A|_L, /* 116-119 */ _W|_A|_L, _W|_A|_L, _W|_A|_L, _W|_A|_P, /* 120-123 */ _W|_A|_P, _W|_A|_P, _W|_A|_P, _W|_A|_C, /* 124-127 */ _W, _W, _W, _W, /* 128-131 */ _W, _W, _W, _W, /* 132-135 */ _W, _W, _W, _W, /* 136-139 */ _W, _W, _W, _W, /* 140-143 */ _W, _W, _W, _W, /* 144-147 */ _W, _W, _W, _W, /* 148-151 */ _W, _W, _W, _W, /* 152-155 */ _W, _W, _W, _W, /* 156-159 */ _W|_S|_SP, _W|_P, _W|_P, _W|_P, /* 160-163 */ _W|_P, _W|_P, _W|_P, _W|_P, /* 164-167 */ _W|_P, _W|_P, _W|_P, _W|_P, /* 168-171 */ _W|_P, _W|_P, _W|_P, _W|_P, /* 172-175 */ _W|_P, _W|_P, _W|_P, _W|_P, /* 176-179 */ _W|_P, _W|_P, _W|_P, _W|_P, /* 180-183 */ _W|_P, _W|_P, _W|_P, _W|_P, /* 184-187 */ _W|_P, _W|_P, _W|_P, _W|_P, /* 188-191 */ _W|_U, _W|_U, _W|_U, _W|_U, /* 192-195 */ _W|_U, _W|_U, _W|_U, _W|_U, /* 196-199 */ _W|_U, _W|_U, _W|_U, _W|_U, /* 200-203 */ _W|_U, _W|_U, _W|_U, _W|_U, /* 204-207 */ _W|_U, _W|_U, _W|_U, _W|_U, /* 208-211 */ _W|_U, _W|_U, _W|_U, _W|_P, /* 212-215 */ _W|_U, _W|_U, _W|_U, _W|_U, /* 216-219 */ _W|_U, _W|_U, _W|_U, _W|_L, /* 220-223 */ _W|_L, _W|_L, _W|_L, _W|_L, /* 224-227 */ _W|_L, _W|_L, _W|_L, _W|_L, /* 228-231 */ _W|_L, _W|_L, _W|_L, _W|_L, /* 232-235 */ _W|_L, _W|_L, _W|_L, _W|_L, /* 236-239 */ _W|_L, _W|_L, _W|_L, _W|_L, /* 240-243 */ _W|_L, _W|_L, _W|_L, _W|_P, /* 244-247 */ _W|_L, _W|_L, _W|_L, _W|_L, /* 248-251 */ _W|_L, _W|_L, _W|_L, _W|_L}; /* 252-255 */ static inline int match_token(struct ts_fsm_token *t, u8 d) { if (t->type) return (token_lookup_tbl[d] & t->type) != 0; else return t->value == d; } static unsigned int fsm_find(struct ts_config *conf, struct ts_state *state) { struct ts_fsm *fsm = ts_config_priv(conf); struct ts_fsm_token *cur = NULL, *next; unsigned int match_start, block_idx = 0, tok_idx; unsigned block_len = 0, strict, consumed = state->offset; const u8 *data; #define GET_NEXT_BLOCK() \ ({ consumed += block_idx; \ block_idx = 0; \ block_len = conf->get_next_block(consumed, &data, conf, state); }) #define TOKEN_MISMATCH() \ do { \ if (strict) \ goto no_match; \ block_idx++; \ goto startover; \ } while(0) #define end_of_data() unlikely(block_idx >= block_len && !GET_NEXT_BLOCK()) if (end_of_data()) goto no_match; strict = fsm->tokens[0].recur != TS_FSM_HEAD_IGNORE; startover: match_start = consumed + block_idx; for (tok_idx = 0; tok_idx < fsm->ntokens; tok_idx++) { cur = &fsm->tokens[tok_idx]; if (likely(tok_idx < (fsm->ntokens - 1))) next = &fsm->tokens[tok_idx + 1]; else next = NULL; switch (cur->recur) { case TS_FSM_SINGLE: if (end_of_data()) goto no_match; if (!match_token(cur, data[block_idx])) TOKEN_MISMATCH(); break; case TS_FSM_PERHAPS: if (end_of_data() || !match_token(cur, data[block_idx])) continue; break; case TS_FSM_MULTI: if (end_of_data()) goto no_match; if (!match_token(cur, data[block_idx])) TOKEN_MISMATCH(); block_idx++; fallthrough; case TS_FSM_ANY: if (next == NULL) goto found_match; if (end_of_data()) continue; while (!match_token(next, data[block_idx])) { if (!match_token(cur, data[block_idx])) TOKEN_MISMATCH(); block_idx++; if (end_of_data()) goto no_match; } continue; /* * Optimization: Prefer small local loop over jumping * back and forth until garbage at head is munched. */ case TS_FSM_HEAD_IGNORE: if (end_of_data()) continue; while (!match_token(next, data[block_idx])) { /* * Special case, don't start over upon * a mismatch, give the user the * chance to specify the type of data * allowed to be ignored. */ if (!match_token(cur, data[block_idx])) goto no_match; block_idx++; if (end_of_data()) goto no_match; } match_start = consumed + block_idx; continue; } block_idx++; } if (end_of_data()) goto found_match; no_match: return UINT_MAX; found_match: state->offset = consumed + block_idx; return match_start; } static struct ts_config *fsm_init(const void *pattern, unsigned int len, gfp_t gfp_mask, int flags) { int i, err = -EINVAL; struct ts_config *conf; struct ts_fsm *fsm; struct ts_fsm_token *tokens = (struct ts_fsm_token *) pattern; unsigned int ntokens = len / sizeof(*tokens); size_t priv_size = sizeof(*fsm) + len; if (len % sizeof(struct ts_fsm_token) || ntokens < 1) goto errout; if (flags & TS_IGNORECASE) goto errout; for (i = 0; i < ntokens; i++) { struct ts_fsm_token *t = &tokens[i]; if (t->type > TS_FSM_TYPE_MAX || t->recur > TS_FSM_RECUR_MAX) goto errout; if (t->recur == TS_FSM_HEAD_IGNORE && (i != 0 || i == (ntokens - 1))) goto errout; } conf = alloc_ts_config(priv_size, gfp_mask); if (IS_ERR(conf)) return conf; conf->flags = flags; fsm = ts_config_priv(conf); fsm->ntokens = ntokens; memcpy(fsm->tokens, pattern, len); for (i = 0; i < fsm->ntokens; i++) { struct ts_fsm_token *t = &fsm->tokens[i]; t->type = token_map[t->type]; } return conf; errout: return ERR_PTR(err); } static void *fsm_get_pattern(struct ts_config *conf) { struct ts_fsm *fsm = ts_config_priv(conf); return fsm->tokens; } static unsigned int fsm_get_pattern_len(struct ts_config *conf) { struct ts_fsm *fsm = ts_config_priv(conf); return fsm->ntokens * sizeof(struct ts_fsm_token); } static struct ts_ops fsm_ops = { .name = "fsm", .find = fsm_find, .init = fsm_init, .get_pattern = fsm_get_pattern, .get_pattern_len = fsm_get_pattern_len, .owner = THIS_MODULE, .list = LIST_HEAD_INIT(fsm_ops.list) }; static int __init init_fsm(void) { return textsearch_register(&fsm_ops); } static void __exit exit_fsm(void) { textsearch_unregister(&fsm_ops); } MODULE_LICENSE("GPL"); module_init(init_fsm); module_exit(exit_fsm); |
80 63 2607 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 | /* SPDX-License-Identifier: GPL-2.0 */ /* * include/linux/userfaultfd_k.h * * Copyright (C) 2015 Red Hat, Inc. * */ #ifndef _LINUX_USERFAULTFD_K_H #define _LINUX_USERFAULTFD_K_H #ifdef CONFIG_USERFAULTFD #include <linux/userfaultfd.h> /* linux/include/uapi/linux/userfaultfd.h */ #include <linux/fcntl.h> #include <linux/mm.h> #include <linux/swap.h> #include <linux/swapops.h> #include <asm-generic/pgtable_uffd.h> #include <linux/hugetlb_inline.h> /* The set of all possible UFFD-related VM flags. */ #define __VM_UFFD_FLAGS (VM_UFFD_MISSING | VM_UFFD_WP | VM_UFFD_MINOR) /* * CAREFUL: Check include/uapi/asm-generic/fcntl.h when defining * new flags, since they might collide with O_* ones. We want * to re-use O_* flags that couldn't possibly have a meaning * from userfaultfd, in order to leave a free define-space for * shared O_* flags. */ #define UFFD_CLOEXEC O_CLOEXEC #define UFFD_NONBLOCK O_NONBLOCK #define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK) #define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS) /* * Start with fault_pending_wqh and fault_wqh so they're more likely * to be in the same cacheline. * * Locking order: * fd_wqh.lock * fault_pending_wqh.lock * fault_wqh.lock * event_wqh.lock * * To avoid deadlocks, IRQs must be disabled when taking any of the above locks, * since fd_wqh.lock is taken by aio_poll() while it's holding a lock that's * also taken in IRQ context. */ /* * ANDROID: CRC fix for commit f91e6b41dd11 ("userfaultfd: move userfaultfd_ctx * struct to header file") */ #ifndef __GENKSYMS__ struct userfaultfd_ctx { /* waitqueue head for the pending (i.e. not read) userfaults */ wait_queue_head_t fault_pending_wqh; /* waitqueue head for the userfaults */ wait_queue_head_t fault_wqh; /* waitqueue head for the pseudo fd to wakeup poll/read */ wait_queue_head_t fd_wqh; /* waitqueue head for events */ wait_queue_head_t event_wqh; /* a refile sequence protected by fault_pending_wqh lock */ seqcount_spinlock_t refile_seq; /* pseudo fd refcounting */ refcount_t refcount; /* userfaultfd syscall flags */ unsigned int flags; /* features requested from the userspace */ unsigned int features; /* released */ bool released; /* * Prevents userfaultfd operations (fill/move/wp) from happening while * some non-cooperative event(s) is taking place. Increments are done * in write-mode. Whereas, userfaultfd operations, which includes * reading mmap_changing, is done under read-mode. */ struct rw_semaphore map_changing_lock; /* memory mappings are changing because of non-cooperative event */ atomic_t mmap_changing; /* mm with one ore more vmas attached to this userfaultfd_ctx */ struct mm_struct *mm; ANDROID_KABI_RESERVE(1); }; #endif extern int sysctl_unprivileged_userfaultfd; extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason); /* * The mode of operation for __mcopy_atomic and its helpers. * * This is almost an implementation detail (mcopy_atomic below doesn't take this * as a parameter), but it's exposed here because memory-kind-specific * implementations (e.g. hugetlbfs) need to know the mode of operation. */ enum mcopy_atomic_mode { /* A normal copy_from_user into the destination range. */ MCOPY_ATOMIC_NORMAL, /* Don't copy; map the destination range to the zero page. */ MCOPY_ATOMIC_ZEROPAGE, /* Just install pte(s) with the existing page(s) in the page cache. */ MCOPY_ATOMIC_CONTINUE, }; extern int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, struct vm_area_struct *dst_vma, unsigned long dst_addr, struct page *page, bool newly_allocated, bool wp_copy); extern ssize_t mcopy_atomic(struct userfaultfd_ctx *ctx, unsigned long dst_start, unsigned long src_start, unsigned long len, __u64 mode); extern ssize_t mfill_zeropage(struct userfaultfd_ctx *ctx, unsigned long dst_start, unsigned long len); extern ssize_t mcopy_continue(struct userfaultfd_ctx *ctx, unsigned long dst_start, unsigned long len); extern int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start, unsigned long len, bool enable_wp); extern void uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *vma, unsigned long start, unsigned long len, bool enable_wp); /* move_pages */ void double_pt_lock(spinlock_t *ptl1, spinlock_t *ptl2); void double_pt_unlock(spinlock_t *ptl1, spinlock_t *ptl2); ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start, unsigned long src_start, unsigned long len, __u64 flags); int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, unsigned long dst_addr, unsigned long src_addr); /* mm helpers */ static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma, struct vm_userfaultfd_ctx vm_ctx) { return vma->vm_userfaultfd_ctx.ctx == vm_ctx.ctx; } /* * Never enable huge pmd sharing on some uffd registered vmas: * * - VM_UFFD_WP VMAs, because write protect information is per pgtable entry. * * - VM_UFFD_MINOR VMAs, because otherwise we would never get minor faults for * VMAs which share huge pmds. (If you have two mappings to the same * underlying pages, and fault in the non-UFFD-registered one with a write, * with huge pmd sharing this would *also* setup the second UFFD-registered * mapping, and we'd not get minor faults.) */ static inline bool uffd_disable_huge_pmd_share(struct vm_area_struct *vma) { return vma->vm_flags & (VM_UFFD_WP | VM_UFFD_MINOR); } /* * Don't do fault around for either WP or MINOR registered uffd range. For * MINOR registered range, fault around will be a total disaster and ptes can * be installed without notifications; for WP it should mostly be fine as long * as the fault around checks for pte_none() before the installation, however * to be super safe we just forbid it. */ static inline bool uffd_disable_fault_around(struct vm_area_struct *vma) { return vma->vm_flags & (VM_UFFD_WP | VM_UFFD_MINOR); } static inline bool userfaultfd_missing(struct vm_area_struct *vma) { return vma->vm_flags & VM_UFFD_MISSING; } static inline bool userfaultfd_wp(struct vm_area_struct *vma) { return vma->vm_flags & VM_UFFD_WP; } static inline bool userfaultfd_minor(struct vm_area_struct *vma) { return vma->vm_flags & VM_UFFD_MINOR; } static inline bool userfaultfd_pte_wp(struct vm_area_struct *vma, pte_t pte) { return userfaultfd_wp(vma) && pte_uffd_wp(pte); } static inline bool userfaultfd_huge_pmd_wp(struct vm_area_struct *vma, pmd_t pmd) { return userfaultfd_wp(vma) && pmd_uffd_wp(pmd); } static inline bool userfaultfd_armed(struct vm_area_struct *vma) { return vma->vm_flags & __VM_UFFD_FLAGS; } static inline bool vma_can_userfault(struct vm_area_struct *vma, unsigned long vm_flags) { if ((vm_flags & VM_UFFD_MINOR) && (!is_vm_hugetlb_page(vma) && !vma_is_shmem(vma))) return false; #ifndef CONFIG_PTE_MARKER_UFFD_WP /* * If user requested uffd-wp but not enabled pte markers for * uffd-wp, then shmem & hugetlbfs are not supported but only * anonymous. */ if ((vm_flags & VM_UFFD_WP) && !vma_is_anonymous(vma)) return false; #endif return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) || vma_is_shmem(vma); } extern int dup_userfaultfd(struct vm_area_struct *, struct list_head *); extern void dup_userfaultfd_complete(struct list_head *); extern void mremap_userfaultfd_prep(struct vm_area_struct *, struct vm_userfaultfd_ctx *); extern void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *, unsigned long from, unsigned long to, unsigned long len); extern bool userfaultfd_remove(struct vm_area_struct *vma, unsigned long start, unsigned long end); extern int userfaultfd_unmap_prep(struct vm_area_struct *vma, unsigned long start, unsigned long end, struct list_head *uf); extern void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf); #else /* CONFIG_USERFAULTFD */ /* mm helpers */ static inline vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) { return VM_FAULT_SIGBUS; } static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma, struct vm_userfaultfd_ctx vm_ctx) { return true; } static inline bool userfaultfd_missing(struct vm_area_struct *vma) { return false; } static inline bool userfaultfd_wp(struct vm_area_struct *vma) { return false; } static inline bool userfaultfd_minor(struct vm_area_struct *vma) { return false; } static inline bool userfaultfd_pte_wp(struct vm_area_struct *vma, pte_t pte) { return false; } static inline bool userfaultfd_huge_pmd_wp(struct vm_area_struct *vma, pmd_t pmd) { return false; } static inline bool userfaultfd_armed(struct vm_area_struct *vma) { return false; } static inline int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *l) { return 0; } static inline void dup_userfaultfd_complete(struct list_head *l) { } static inline void mremap_userfaultfd_prep(struct vm_area_struct *vma, struct vm_userfaultfd_ctx *ctx) { } static inline void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *ctx, unsigned long from, unsigned long to, unsigned long len) { } static inline bool userfaultfd_remove(struct vm_area_struct *vma, unsigned long start, unsigned long end) { return true; } static inline int userfaultfd_unmap_prep(struct vm_area_struct *vma, unsigned long start, unsigned long end, struct list_head *uf) { return 0; } static inline void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf) { } static inline bool uffd_disable_fault_around(struct vm_area_struct *vma) { return false; } #endif /* CONFIG_USERFAULTFD */ static inline bool pte_marker_entry_uffd_wp(swp_entry_t entry) { #ifdef CONFIG_PTE_MARKER_UFFD_WP return is_pte_marker_entry(entry) && (pte_marker_get(entry) & PTE_MARKER_UFFD_WP); #else return false; #endif } static inline bool pte_marker_uffd_wp(pte_t pte) { #ifdef CONFIG_PTE_MARKER_UFFD_WP swp_entry_t entry; if (!is_swap_pte(pte)) return false; entry = pte_to_swp_entry(pte); return pte_marker_entry_uffd_wp(entry); #else return false; #endif } /* * Returns true if this is a swap pte and was uffd-wp wr-protected in either * forms (pte marker or a normal swap pte), false otherwise. */ static inline bool pte_swp_uffd_wp_any(pte_t pte) { #ifdef CONFIG_PTE_MARKER_UFFD_WP if (!is_swap_pte(pte)) return false; if (pte_swp_uffd_wp(pte)) return true; if (pte_marker_uffd_wp(pte)) return true; #endif return false; } #endif /* _LINUX_USERFAULTFD_K_H */ |
84 3 81 42 42 78 81 65 65 65 65 52 52 47 47 47 47 65 65 7 4 4 976 975 65 65 65 65 65 65 65 65 65 65 65 6 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 | // SPDX-License-Identifier: GPL-2.0-or-later #include <net/gro.h> #include <net/dst_metadata.h> #include <net/busy_poll.h> #include <trace/events/net.h> #define MAX_GRO_SKBS 8 /* This should be increased if a protocol with a bigger head is added. */ #define GRO_MAX_HEAD (MAX_HEADER + 128) static DEFINE_SPINLOCK(offload_lock); static struct list_head offload_base __read_mostly = LIST_HEAD_INIT(offload_base); /* Maximum number of GRO_NORMAL skbs to batch up for list-RX */ int gro_normal_batch __read_mostly = 8; /** * dev_add_offload - register offload handlers * @po: protocol offload declaration * * Add protocol offload handlers to the networking stack. The passed * &proto_offload is linked into kernel lists and may not be freed until * it has been removed from the kernel lists. * * This call does not sleep therefore it can not * guarantee all CPU's that are in middle of receiving packets * will see the new offload handlers (until the next received packet). */ void dev_add_offload(struct packet_offload *po) { struct packet_offload *elem; spin_lock(&offload_lock); list_for_each_entry(elem, &offload_base, list) { if (po->priority < elem->priority) break; } list_add_rcu(&po->list, elem->list.prev); spin_unlock(&offload_lock); } EXPORT_SYMBOL(dev_add_offload); /** * __dev_remove_offload - remove offload handler * @po: packet offload declaration * * Remove a protocol offload handler that was previously added to the * kernel offload handlers by dev_add_offload(). The passed &offload_type * is removed from the kernel lists and can be freed or reused once this * function returns. * * The packet type might still be in use by receivers * and must not be freed until after all the CPU's have gone * through a quiescent state. */ static void __dev_remove_offload(struct packet_offload *po) { struct list_head *head = &offload_base; struct packet_offload *po1; spin_lock(&offload_lock); list_for_each_entry(po1, head, list) { if (po == po1) { list_del_rcu(&po->list); goto out; } } pr_warn("dev_remove_offload: %p not found\n", po); out: spin_unlock(&offload_lock); } /** * dev_remove_offload - remove packet offload handler * @po: packet offload declaration * * Remove a packet offload handler that was previously added to the kernel * offload handlers by dev_add_offload(). The passed &offload_type is * removed from the kernel lists and can be freed or reused once this * function returns. * * This call sleeps to guarantee that no CPU is looking at the packet * type after return. */ void dev_remove_offload(struct packet_offload *po) { __dev_remove_offload(po); synchronize_net(); } EXPORT_SYMBOL(dev_remove_offload); /** * skb_eth_gso_segment - segmentation handler for ethernet protocols. * @skb: buffer to segment * @features: features for the output path (see dev->features) * @type: Ethernet Protocol ID */ struct sk_buff *skb_eth_gso_segment(struct sk_buff *skb, netdev_features_t features, __be16 type) { struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); struct packet_offload *ptype; rcu_read_lock(); list_for_each_entry_rcu(ptype, &offload_base, list) { if (ptype->type == type && ptype->callbacks.gso_segment) { segs = ptype->callbacks.gso_segment(skb, features); break; } } rcu_read_unlock(); return segs; } EXPORT_SYMBOL(skb_eth_gso_segment); /** * skb_mac_gso_segment - mac layer segmentation handler. * @skb: buffer to segment * @features: features for the output path (see dev->features) */ struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, netdev_features_t features) { struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); struct packet_offload *ptype; int vlan_depth = skb->mac_len; __be16 type = skb_network_protocol(skb, &vlan_depth); if (unlikely(!type)) return ERR_PTR(-EINVAL); __skb_pull(skb, vlan_depth); rcu_read_lock(); list_for_each_entry_rcu(ptype, &offload_base, list) { if (ptype->type == type && ptype->callbacks.gso_segment) { segs = ptype->callbacks.gso_segment(skb, features); break; } } rcu_read_unlock(); __skb_push(skb, skb->data - skb_mac_header(skb)); return segs; } EXPORT_SYMBOL(skb_mac_gso_segment); int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) { struct skb_shared_info *pinfo, *skbinfo = skb_shinfo(skb); unsigned int offset = skb_gro_offset(skb); unsigned int headlen = skb_headlen(skb); unsigned int len = skb_gro_len(skb); unsigned int delta_truesize; unsigned int gro_max_size; unsigned int new_truesize; struct sk_buff *lp; int segs; /* Do not splice page pool based packets w/ non-page pool * packets. This can result in reference count issues as page * pool pages will not decrement the reference count and will * instead be immediately returned to the pool or have frag * count decremented. */ if (p->pp_recycle != skb->pp_recycle) return -ETOOMANYREFS; /* pairs with WRITE_ONCE() in netif_set_gro_max_size() */ gro_max_size = READ_ONCE(p->dev->gro_max_size); if (unlikely(p->len + len >= gro_max_size || NAPI_GRO_CB(skb)->flush)) return -E2BIG; if (unlikely(p->len + len >= GRO_LEGACY_MAX_SIZE)) { if (p->protocol != htons(ETH_P_IPV6) || skb_headroom(p) < sizeof(struct hop_jumbo_hdr) || ipv6_hdr(p)->nexthdr != IPPROTO_TCP || p->encapsulation) return -E2BIG; } segs = NAPI_GRO_CB(skb)->count; lp = NAPI_GRO_CB(p)->last; pinfo = skb_shinfo(lp); if (headlen <= offset) { skb_frag_t *frag; skb_frag_t *frag2; int i = skbinfo->nr_frags; int nr_frags = pinfo->nr_frags + i; if (nr_frags > MAX_SKB_FRAGS) goto merge; offset -= headlen; pinfo->nr_frags = nr_frags; skbinfo->nr_frags = 0; frag = pinfo->frags + nr_frags; frag2 = skbinfo->frags + i; do { *--frag = *--frag2; } while (--i); skb_frag_off_add(frag, offset); skb_frag_size_sub(frag, offset); /* all fragments truesize : remove (head size + sk_buff) */ new_truesize = SKB_TRUESIZE(skb_end_offset(skb)); delta_truesize = skb->truesize - new_truesize; skb->truesize = new_truesize; skb->len -= skb->data_len; skb->data_len = 0; NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE; goto done; } else if (skb->head_frag) { int nr_frags = pinfo->nr_frags; skb_frag_t *frag = pinfo->frags + nr_frags; struct page *page = virt_to_head_page(skb->head); unsigned int first_size = headlen - offset; unsigned int first_offset; if (nr_frags + 1 + skbinfo->nr_frags > MAX_SKB_FRAGS) goto merge; first_offset = skb->data - (unsigned char *)page_address(page) + offset; pinfo->nr_frags = nr_frags + 1 + skbinfo->nr_frags; __skb_frag_set_page(frag, page); skb_frag_off_set(frag, first_offset); skb_frag_size_set(frag, first_size); memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags); /* We dont need to clear skbinfo->nr_frags here */ new_truesize = SKB_DATA_ALIGN(sizeof(struct sk_buff)); delta_truesize = skb->truesize - new_truesize; skb->truesize = new_truesize; NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD; goto done; } merge: /* sk ownership - if any - completely transferred to the aggregated packet */ skb->destructor = NULL; skb->sk = NULL; delta_truesize = skb->truesize; if (offset > headlen) { unsigned int eat = offset - headlen; skb_frag_off_add(&skbinfo->frags[0], eat); skb_frag_size_sub(&skbinfo->frags[0], eat); skb->data_len -= eat; skb->len -= eat; offset = headlen; } __skb_pull(skb, offset); if (NAPI_GRO_CB(p)->last == p) skb_shinfo(p)->frag_list = skb; else NAPI_GRO_CB(p)->last->next = skb; NAPI_GRO_CB(p)->last = skb; __skb_header_release(skb); lp = p; done: NAPI_GRO_CB(p)->count += segs; p->data_len += len; p->truesize += delta_truesize; p->len += len; if (lp != p) { lp->data_len += len; lp->truesize += delta_truesize; lp->len += len; } NAPI_GRO_CB(skb)->same_flow = 1; return 0; } static void napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb) { struct packet_offload *ptype; __be16 type = skb->protocol; struct list_head *head = &offload_base; int err = -ENOENT; BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb)); if (NAPI_GRO_CB(skb)->count == 1) { skb_shinfo(skb)->gso_size = 0; goto out; } rcu_read_lock(); list_for_each_entry_rcu(ptype, head, list) { if (ptype->type != type || !ptype->callbacks.gro_complete) continue; err = INDIRECT_CALL_INET(ptype->callbacks.gro_complete, ipv6_gro_complete, inet_gro_complete, skb, 0); break; } rcu_read_unlock(); if (err) { WARN_ON(&ptype->list == head); kfree_skb(skb); return; } out: gro_normal_one(napi, skb, NAPI_GRO_CB(skb)->count); } static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index, bool flush_old) { struct list_head *head = &napi->gro_hash[index].list; struct sk_buff *skb, *p; list_for_each_entry_safe_reverse(skb, p, head, list) { if (flush_old && NAPI_GRO_CB(skb)->age == jiffies) return; skb_list_del_init(skb); napi_gro_complete(napi, skb); napi->gro_hash[index].count--; } if (!napi->gro_hash[index].count) __clear_bit(index, &napi->gro_bitmask); } /* napi->gro_hash[].list contains packets ordered by age. * youngest packets at the head of it. * Complete skbs in reverse order to reduce latencies. */ void napi_gro_flush(struct napi_struct *napi, bool flush_old) { unsigned long bitmask = napi->gro_bitmask; unsigned int i, base = ~0U; while ((i = ffs(bitmask)) != 0) { bitmask >>= i; base += i; __napi_gro_flush_chain(napi, base, flush_old); } } EXPORT_SYMBOL(napi_gro_flush); static void gro_list_prepare(const struct list_head *head, const struct sk_buff *skb) { unsigned int maclen = skb->dev->hard_header_len; u32 hash = skb_get_hash_raw(skb); struct sk_buff *p; list_for_each_entry(p, head, list) { unsigned long diffs; NAPI_GRO_CB(p)->flush = 0; if (hash != skb_get_hash_raw(p)) { NAPI_GRO_CB(p)->same_flow = 0; continue; } diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; diffs |= skb_vlan_tag_present(p) ^ skb_vlan_tag_present(skb); if (skb_vlan_tag_present(p)) diffs |= skb_vlan_tag_get(p) ^ skb_vlan_tag_get(skb); diffs |= skb_metadata_differs(p, skb); if (maclen == ETH_HLEN) diffs |= compare_ether_header(skb_mac_header(p), skb_mac_header(skb)); else if (!diffs) diffs = memcmp(skb_mac_header(p), skb_mac_header(skb), maclen); /* in most common scenarions 'slow_gro' is 0 * otherwise we are already on some slower paths * either skip all the infrequent tests altogether or * avoid trying too hard to skip each of them individually */ if (!diffs && unlikely(skb->slow_gro | p->slow_gro)) { #if IS_ENABLED(CONFIG_SKB_EXTENSIONS) && IS_ENABLED(CONFIG_NET_TC_SKB_EXT) struct tc_skb_ext *skb_ext; struct tc_skb_ext *p_ext; #endif diffs |= p->sk != skb->sk; diffs |= skb_metadata_dst_cmp(p, skb); diffs |= skb_get_nfct(p) ^ skb_get_nfct(skb); #if IS_ENABLED(CONFIG_SKB_EXTENSIONS) && IS_ENABLED(CONFIG_NET_TC_SKB_EXT) skb_ext = skb_ext_find(skb, TC_SKB_EXT); p_ext = skb_ext_find(p, TC_SKB_EXT); diffs |= (!!p_ext) ^ (!!skb_ext); if (!diffs && unlikely(skb_ext)) diffs |= p_ext->chain ^ skb_ext->chain; #endif } NAPI_GRO_CB(p)->same_flow = !diffs; } } static inline void skb_gro_reset_offset(struct sk_buff *skb, u32 nhoff) { const struct skb_shared_info *pinfo = skb_shinfo(skb); const skb_frag_t *frag0 = &pinfo->frags[0]; NAPI_GRO_CB(skb)->data_offset = 0; NAPI_GRO_CB(skb)->frag0 = NULL; NAPI_GRO_CB(skb)->frag0_len = 0; if (!skb_headlen(skb) && pinfo->nr_frags && !PageHighMem(skb_frag_page(frag0)) && (!NET_IP_ALIGN || !((skb_frag_off(frag0) + nhoff) & 3))) { NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int, skb_frag_size(frag0), skb->end - skb->tail); } } static void gro_pull_from_frag0(struct sk_buff *skb, int grow) { struct skb_shared_info *pinfo = skb_shinfo(skb); BUG_ON(skb->end - skb->tail < grow); memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow); skb->data_len -= grow; skb->tail += grow; skb_frag_off_add(&pinfo->frags[0], grow); skb_frag_size_sub(&pinfo->frags[0], grow); if (unlikely(!skb_frag_size(&pinfo->frags[0]))) { skb_frag_unref(skb, 0); memmove(pinfo->frags, pinfo->frags + 1, --pinfo->nr_frags * sizeof(pinfo->frags[0])); } } static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head) { struct sk_buff *oldest; oldest = list_last_entry(head, struct sk_buff, list); /* We are called with head length >= MAX_GRO_SKBS, so this is * impossible. */ if (WARN_ON_ONCE(!oldest)) return; /* Do not adjust napi->gro_hash[].count, caller is adding a new * SKB to the chain. */ skb_list_del_init(oldest); napi_gro_complete(napi, oldest); } static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { u32 bucket = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1); struct gro_list *gro_list = &napi->gro_hash[bucket]; struct list_head *head = &offload_base; struct packet_offload *ptype; __be16 type = skb->protocol; struct sk_buff *pp = NULL; enum gro_result ret; int same_flow; int grow; if (netif_elide_gro(skb->dev)) goto normal; gro_list_prepare(&gro_list->list, skb); rcu_read_lock(); list_for_each_entry_rcu(ptype, head, list) { if (ptype->type == type && ptype->callbacks.gro_receive) goto found_ptype; } rcu_read_unlock(); goto normal; found_ptype: skb_set_network_header(skb, skb_gro_offset(skb)); skb_reset_mac_len(skb); BUILD_BUG_ON(sizeof_field(struct napi_gro_cb, zeroed) != sizeof(u32)); BUILD_BUG_ON(!IS_ALIGNED(offsetof(struct napi_gro_cb, zeroed), sizeof(u32))); /* Avoid slow unaligned acc */ *(u32 *)&NAPI_GRO_CB(skb)->zeroed = 0; NAPI_GRO_CB(skb)->flush = skb_has_frag_list(skb); NAPI_GRO_CB(skb)->is_atomic = 1; NAPI_GRO_CB(skb)->count = 1; if (unlikely(skb_is_gso(skb))) { NAPI_GRO_CB(skb)->count = skb_shinfo(skb)->gso_segs; /* Only support TCP and non DODGY users. */ if (!skb_is_gso_tcp(skb) || (skb_shinfo(skb)->gso_type & SKB_GSO_DODGY)) NAPI_GRO_CB(skb)->flush = 1; } /* Setup for GRO checksum validation */ switch (skb->ip_summed) { case CHECKSUM_COMPLETE: NAPI_GRO_CB(skb)->csum = skb->csum; NAPI_GRO_CB(skb)->csum_valid = 1; break; case CHECKSUM_UNNECESSARY: NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1; break; } pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive, ipv6_gro_receive, inet_gro_receive, &gro_list->list, skb); rcu_read_unlock(); if (PTR_ERR(pp) == -EINPROGRESS) { ret = GRO_CONSUMED; goto ok; } same_flow = NAPI_GRO_CB(skb)->same_flow; ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED; if (pp) { skb_list_del_init(pp); napi_gro_complete(napi, pp); gro_list->count--; } if (same_flow) goto ok; if (NAPI_GRO_CB(skb)->flush) goto normal; if (unlikely(gro_list->count >= MAX_GRO_SKBS)) gro_flush_oldest(napi, &gro_list->list); else gro_list->count++; NAPI_GRO_CB(skb)->age = jiffies; NAPI_GRO_CB(skb)->last = skb; if (!skb_is_gso(skb)) skb_shinfo(skb)->gso_size = skb_gro_len(skb); list_add(&skb->list, &gro_list->list); ret = GRO_HELD; pull: grow = skb_gro_offset(skb) - skb_headlen(skb); if (grow > 0) gro_pull_from_frag0(skb, grow); ok: if (gro_list->count) { if (!test_bit(bucket, &napi->gro_bitmask)) __set_bit(bucket, &napi->gro_bitmask); } else if (test_bit(bucket, &napi->gro_bitmask)) { __clear_bit(bucket, &napi->gro_bitmask); } return ret; normal: ret = GRO_NORMAL; goto pull; } struct packet_offload *gro_find_receive_by_type(__be16 type) { struct list_head *offload_head = &offload_base; struct packet_offload *ptype; list_for_each_entry_rcu(ptype, offload_head, list) { if (ptype->type != type || !ptype->callbacks.gro_receive) continue; return ptype; } return NULL; } EXPORT_SYMBOL(gro_find_receive_by_type); struct packet_offload *gro_find_complete_by_type(__be16 type) { struct list_head *offload_head = &offload_base; struct packet_offload *ptype; list_for_each_entry_rcu(ptype, offload_head, list) { if (ptype->type != type || !ptype->callbacks.gro_complete) continue; return ptype; } return NULL; } EXPORT_SYMBOL(gro_find_complete_by_type); static gro_result_t napi_skb_finish(struct napi_struct *napi, struct sk_buff *skb, gro_result_t ret) { switch (ret) { case GRO_NORMAL: gro_normal_one(napi, skb, 1); break; case GRO_MERGED_FREE: if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) napi_skb_free_stolen_head(skb); else if (skb->fclone != SKB_FCLONE_UNAVAILABLE) __kfree_skb(skb); else __kfree_skb_defer(skb); break; case GRO_HELD: case GRO_MERGED: case GRO_CONSUMED: break; } return ret; } gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { gro_result_t ret; skb_mark_napi_id(skb, napi); trace_napi_gro_receive_entry(skb); skb_gro_reset_offset(skb, 0); ret = napi_skb_finish(napi, skb, dev_gro_receive(napi, skb)); trace_napi_gro_receive_exit(ret); return ret; } EXPORT_SYMBOL(napi_gro_receive); static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) { if (unlikely(skb->pfmemalloc)) { consume_skb(skb); return; } __skb_pull(skb, skb_headlen(skb)); /* restore the reserve we had after netdev_alloc_skb_ip_align() */ skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb)); __vlan_hwaccel_clear_tag(skb); skb->dev = napi->dev; skb->skb_iif = 0; /* eth_type_trans() assumes pkt_type is PACKET_HOST */ skb->pkt_type = PACKET_HOST; skb->encapsulation = 0; skb_shinfo(skb)->gso_type = 0; skb_shinfo(skb)->gso_size = 0; if (unlikely(skb->slow_gro)) { skb_orphan(skb); skb_ext_reset(skb); nf_reset_ct(skb); skb->slow_gro = 0; } napi->skb = skb; } struct sk_buff *napi_get_frags(struct napi_struct *napi) { struct sk_buff *skb = napi->skb; if (!skb) { skb = napi_alloc_skb(napi, GRO_MAX_HEAD); if (skb) { napi->skb = skb; skb_mark_napi_id(skb, napi); } } return skb; } EXPORT_SYMBOL(napi_get_frags); static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, gro_result_t ret) { switch (ret) { case GRO_NORMAL: case GRO_HELD: __skb_push(skb, ETH_HLEN); skb->protocol = eth_type_trans(skb, skb->dev); if (ret == GRO_NORMAL) gro_normal_one(napi, skb, 1); break; case GRO_MERGED_FREE: if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) napi_skb_free_stolen_head(skb); else napi_reuse_skb(napi, skb); break; case GRO_MERGED: case GRO_CONSUMED: break; } return ret; } /* Upper GRO stack assumes network header starts at gro_offset=0 * Drivers could call both napi_gro_frags() and napi_gro_receive() * We copy ethernet header into skb->data to have a common layout. */ static struct sk_buff *napi_frags_skb(struct napi_struct *napi) { struct sk_buff *skb = napi->skb; const struct ethhdr *eth; unsigned int hlen = sizeof(*eth); napi->skb = NULL; skb_reset_mac_header(skb); skb_gro_reset_offset(skb, hlen); if (unlikely(skb_gro_header_hard(skb, hlen))) { eth = skb_gro_header_slow(skb, hlen, 0); if (unlikely(!eth)) { net_warn_ratelimited("%s: dropping impossible skb from %s\n", __func__, napi->dev->name); napi_reuse_skb(napi, skb); return NULL; } } else { eth = (const struct ethhdr *)skb->data; gro_pull_from_frag0(skb, hlen); NAPI_GRO_CB(skb)->frag0 += hlen; NAPI_GRO_CB(skb)->frag0_len -= hlen; } __skb_pull(skb, hlen); /* * This works because the only protocols we care about don't require * special handling. * We'll fix it up properly in napi_frags_finish() */ skb->protocol = eth->h_proto; return skb; } gro_result_t napi_gro_frags(struct napi_struct *napi) { gro_result_t ret; struct sk_buff *skb = napi_frags_skb(napi); trace_napi_gro_frags_entry(skb); ret = napi_frags_finish(napi, skb, dev_gro_receive(napi, skb)); trace_napi_gro_frags_exit(ret); return ret; } EXPORT_SYMBOL(napi_gro_frags); /* Compute the checksum from gro_offset and return the folded value * after adding in any pseudo checksum. */ __sum16 __skb_gro_checksum_complete(struct sk_buff *skb) { __wsum wsum; __sum16 sum; wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0); /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */ sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum)); /* See comments in __skb_checksum_complete(). */ if (likely(!sum)) { if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && !skb->csum_complete_sw) netdev_rx_csum_fault(skb->dev, skb); } NAPI_GRO_CB(skb)->csum = wsum; NAPI_GRO_CB(skb)->csum_valid = 1; return sum; } EXPORT_SYMBOL(__skb_gro_checksum_complete); |
11 11 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 | // SPDX-License-Identifier: GPL-2.0 /* * This file contains the base functions to manage periodic tick * related events. * * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner */ #include <linux/cpu.h> #include <linux/err.h> #include <linux/hrtimer.h> #include <linux/interrupt.h> #include <linux/nmi.h> #include <linux/percpu.h> #include <linux/profile.h> #include <linux/sched.h> #include <linux/module.h> #include <trace/events/power.h> #include <trace/hooks/sched.h> #include <asm/irq_regs.h> #include "tick-internal.h" /* * Tick devices */ DEFINE_PER_CPU(struct tick_device, tick_cpu_device); /* * Tick next event: keeps track of the tick time. It's updated by the * CPU which handles the tick and protected by jiffies_lock. There is * no requirement to write hold the jiffies seqcount for it. */ ktime_t tick_next_period; /* * tick_do_timer_cpu is a timer core internal variable which holds the CPU NR * which is responsible for calling do_timer(), i.e. the timekeeping stuff. This * variable has two functions: * * 1) Prevent a thundering herd issue of a gazillion of CPUs trying to grab the * timekeeping lock all at once. Only the CPU which is assigned to do the * update is handling it. * * 2) Hand off the duty in the NOHZ idle case by setting the value to * TICK_DO_TIMER_NONE, i.e. a non existing CPU. So the next cpu which looks * at it will take over and keep the time keeping alive. The handover * procedure also covers cpu hotplug. */ int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT; #ifdef CONFIG_NO_HZ_FULL /* * tick_do_timer_boot_cpu indicates the boot CPU temporarily owns * tick_do_timer_cpu and it should be taken over by an eligible secondary * when one comes online. */ static int tick_do_timer_boot_cpu __read_mostly = -1; #endif /* * Debugging: see timer_list.c */ struct tick_device *tick_get_device(int cpu) { return &per_cpu(tick_cpu_device, cpu); } /** * tick_is_oneshot_available - check for a oneshot capable event device */ int tick_is_oneshot_available(void) { struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT)) return 0; if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) return 1; return tick_broadcast_oneshot_available(); } /* * Periodic tick */ static void tick_periodic(int cpu) { if (tick_do_timer_cpu == cpu) { raw_spin_lock(&jiffies_lock); write_seqcount_begin(&jiffies_seq); /* Keep track of the next tick event */ tick_next_period = ktime_add_ns(tick_next_period, TICK_NSEC); do_timer(1); write_seqcount_end(&jiffies_seq); raw_spin_unlock(&jiffies_lock); update_wall_time(); trace_android_vh_jiffies_update(NULL); } update_process_times(user_mode(get_irq_regs())); profile_tick(CPU_PROFILING); } /* * Event handler for periodic ticks */ void tick_handle_periodic(struct clock_event_device *dev) { int cpu = smp_processor_id(); ktime_t next = dev->next_event; tick_periodic(cpu); #if defined(CONFIG_HIGH_RES_TIMERS) || defined(CONFIG_NO_HZ_COMMON) /* * The cpu might have transitioned to HIGHRES or NOHZ mode via * update_process_times() -> run_local_timers() -> * hrtimer_run_queues(). */ if (dev->event_handler != tick_handle_periodic) return; #endif if (!clockevent_state_oneshot(dev)) return; for (;;) { /* * Setup the next period for devices, which do not have * periodic mode: */ next = ktime_add_ns(next, TICK_NSEC); if (!clockevents_program_event(dev, next, false)) return; /* * Have to be careful here. If we're in oneshot mode, * before we call tick_periodic() in a loop, we need * to be sure we're using a real hardware clocksource. * Otherwise we could get trapped in an infinite * loop, as the tick_periodic() increments jiffies, * which then will increment time, possibly causing * the loop to trigger again and again. */ if (timekeeping_valid_for_hres()) tick_periodic(cpu); } } /* * Setup the device for a periodic tick */ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) { tick_set_periodic_handler(dev, broadcast); /* Broadcast setup ? */ if (!tick_device_is_functional(dev)) return; if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) && !tick_broadcast_oneshot_active()) { clockevents_switch_state(dev, CLOCK_EVT_STATE_PERIODIC); } else { unsigned int seq; ktime_t next; do { seq = read_seqcount_begin(&jiffies_seq); next = tick_next_period; } while (read_seqcount_retry(&jiffies_seq, seq)); clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT); for (;;) { if (!clockevents_program_event(dev, next, false)) return; next = ktime_add_ns(next, TICK_NSEC); } } } /* * Setup the tick device */ static void tick_setup_device(struct tick_device *td, struct clock_event_device *newdev, int cpu, const struct cpumask *cpumask) { void (*handler)(struct clock_event_device *) = NULL; ktime_t next_event = 0; /* * First device setup ? */ if (!td->evtdev) { /* * If no cpu took the do_timer update, assign it to * this cpu: */ if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) { tick_do_timer_cpu = cpu; tick_next_period = ktime_get(); #ifdef CONFIG_NO_HZ_FULL /* * The boot CPU may be nohz_full, in which case the * first housekeeping secondary will take do_timer() * from it. */ if (tick_nohz_full_cpu(cpu)) tick_do_timer_boot_cpu = cpu; } else if (tick_do_timer_boot_cpu != -1 && !tick_nohz_full_cpu(cpu)) { tick_do_timer_boot_cpu = -1; /* * The boot CPU will stay in periodic (NOHZ disabled) * mode until clocksource_done_booting() called after * smp_init() selects a high resolution clocksource and * timekeeping_notify() kicks the NOHZ stuff alive. * * So this WRITE_ONCE can only race with the READ_ONCE * check in tick_periodic() but this race is harmless. */ WRITE_ONCE(tick_do_timer_cpu, cpu); #endif } /* * Startup in periodic mode first. */ td->mode = TICKDEV_MODE_PERIODIC; } else { handler = td->evtdev->event_handler; next_event = td->evtdev->next_event; td->evtdev->event_handler = clockevents_handle_noop; } td->evtdev = newdev; /* * When the device is not per cpu, pin the interrupt to the * current cpu: */ if (!cpumask_equal(newdev->cpumask, cpumask)) irq_set_affinity(newdev->irq, cpumask); /* * When global broadcasting is active, check if the current * device is registered as a placeholder for broadcast mode. * This allows us to handle this x86 misfeature in a generic * way. This function also returns !=0 when we keep the * current active broadcast state for this CPU. */ if (tick_device_uses_broadcast(newdev, cpu)) return; if (td->mode == TICKDEV_MODE_PERIODIC) tick_setup_periodic(newdev, 0); else tick_setup_oneshot(newdev, handler, next_event); } void tick_install_replacement(struct clock_event_device *newdev) { struct tick_device *td = this_cpu_ptr(&tick_cpu_device); int cpu = smp_processor_id(); clockevents_exchange_device(td->evtdev, newdev); tick_setup_device(td, newdev, cpu, cpumask_of(cpu)); if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) tick_oneshot_notify(); } static bool tick_check_percpu(struct clock_event_device *curdev, struct clock_event_device *newdev, int cpu) { if (!cpumask_test_cpu(cpu, newdev->cpumask)) return false; if (cpumask_equal(newdev->cpumask, cpumask_of(cpu))) return true; /* Check if irq affinity can be set */ if (newdev->irq >= 0 && !irq_can_set_affinity(newdev->irq)) return false; /* Prefer an existing cpu local device */ if (curdev && cpumask_equal(curdev->cpumask, cpumask_of(cpu))) return false; return true; } static bool tick_check_preferred(struct clock_event_device *curdev, struct clock_event_device *newdev) { /* Prefer oneshot capable device */ if (!(newdev->features & CLOCK_EVT_FEAT_ONESHOT)) { if (curdev && (curdev->features & CLOCK_EVT_FEAT_ONESHOT)) return false; if (tick_oneshot_mode_active()) return false; } /* * Use the higher rated one, but prefer a CPU local device with a lower * rating than a non-CPU local device */ return !curdev || newdev->rating > curdev->rating || !cpumask_equal(curdev->cpumask, newdev->cpumask); } /* * Check whether the new device is a better fit than curdev. curdev * can be NULL ! */ bool tick_check_replacement(struct clock_event_device *curdev, struct clock_event_device *newdev) { if (!tick_check_percpu(curdev, newdev, smp_processor_id())) return false; return tick_check_preferred(curdev, newdev); } /* * Check, if the new registered device should be used. Called with * clockevents_lock held and interrupts disabled. */ void tick_check_new_device(struct clock_event_device *newdev) { struct clock_event_device *curdev; struct tick_device *td; int cpu; cpu = smp_processor_id(); td = &per_cpu(tick_cpu_device, cpu); curdev = td->evtdev; if (!tick_check_replacement(curdev, newdev)) goto out_bc; if (!try_module_get(newdev->owner)) return; /* * Replace the eventually existing device by the new * device. If the current device is the broadcast device, do * not give it back to the clockevents layer ! */ if (tick_is_broadcast_device(curdev)) { clockevents_shutdown(curdev); curdev = NULL; } clockevents_exchange_device(curdev, newdev); tick_setup_device(td, newdev, cpu, cpumask_of(cpu)); if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) tick_oneshot_notify(); return; out_bc: /* * Can the new device be used as a broadcast device ? */ tick_install_broadcast_device(newdev, cpu); } /** * tick_broadcast_oneshot_control - Enter/exit broadcast oneshot mode * @state: The target state (enter/exit) * * The system enters/leaves a state, where affected devices might stop * Returns 0 on success, -EBUSY if the cpu is used to broadcast wakeups. * * Called with interrupts disabled, so clockevents_lock is not * required here because the local clock event device cannot go away * under us. */ int tick_broadcast_oneshot_control(enum tick_broadcast_state state) { struct tick_device *td = this_cpu_ptr(&tick_cpu_device); if (!(td->evtdev->features & CLOCK_EVT_FEAT_C3STOP)) return 0; return __tick_broadcast_oneshot_control(state); } EXPORT_SYMBOL_GPL(tick_broadcast_oneshot_control); #ifdef CONFIG_HOTPLUG_CPU /* * Transfer the do_timer job away from a dying cpu. * * Called with interrupts disabled. No locking required. If * tick_do_timer_cpu is owned by this cpu, nothing can change it. */ void tick_handover_do_timer(void) { if (tick_do_timer_cpu == smp_processor_id()) tick_do_timer_cpu = cpumask_first(cpu_online_mask); } /* * Shutdown an event device on a given cpu: * * This is called on a life CPU, when a CPU is dead. So we cannot * access the hardware device itself. * We just set the mode and remove it from the lists. */ void tick_shutdown(unsigned int cpu) { struct tick_device *td = &per_cpu(tick_cpu_device, cpu); struct clock_event_device *dev = td->evtdev; td->mode = TICKDEV_MODE_PERIODIC; if (dev) { /* * Prevent that the clock events layer tries to call * the set mode function! */ clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED); clockevents_exchange_device(dev, NULL); dev->event_handler = clockevents_handle_noop; td->evtdev = NULL; } } #endif /** * tick_suspend_local - Suspend the local tick device * * Called from the local cpu for freeze with interrupts disabled. * * No locks required. Nothing can change the per cpu device. */ void tick_suspend_local(void) { struct tick_device *td = this_cpu_ptr(&tick_cpu_device); clockevents_shutdown(td->evtdev); } /** * tick_resume_local - Resume the local tick device * * Called from the local CPU for unfreeze or XEN resume magic. * * No locks required. Nothing can change the per cpu device. */ void tick_resume_local(void) { struct tick_device *td = this_cpu_ptr(&tick_cpu_device); bool broadcast = tick_resume_check_broadcast(); clockevents_tick_resume(td->evtdev); if (!broadcast) { if (td->mode == TICKDEV_MODE_PERIODIC) tick_setup_periodic(td->evtdev, 0); else tick_resume_oneshot(); } /* * Ensure that hrtimers are up to date and the clockevents device * is reprogrammed correctly when high resolution timers are * enabled. */ hrtimers_resume_local(); } /** * tick_suspend - Suspend the tick and the broadcast device * * Called from syscore_suspend() via timekeeping_suspend with only one * CPU online and interrupts disabled or from tick_unfreeze() under * tick_freeze_lock. * * No locks required. Nothing can change the per cpu device. */ void tick_suspend(void) { tick_suspend_local(); tick_suspend_broadcast(); } /** * tick_resume - Resume the tick and the broadcast device * * Called from syscore_resume() via timekeeping_resume with only one * CPU online and interrupts disabled. * * No locks required. Nothing can change the per cpu device. */ void tick_resume(void) { tick_resume_broadcast(); tick_resume_local(); } #ifdef CONFIG_SUSPEND static DEFINE_RAW_SPINLOCK(tick_freeze_lock); static unsigned int tick_freeze_depth; /** * tick_freeze - Suspend the local tick and (possibly) timekeeping. * * Check if this is the last online CPU executing the function and if so, * suspend timekeeping. Otherwise suspend the local tick. * * Call with interrupts disabled. Must be balanced with %tick_unfreeze(). * Interrupts must not be enabled before the subsequent %tick_unfreeze(). */ void tick_freeze(void) { raw_spin_lock(&tick_freeze_lock); tick_freeze_depth++; if (tick_freeze_depth == num_online_cpus()) { trace_suspend_resume(TPS("timekeeping_freeze"), smp_processor_id(), true); system_state = SYSTEM_SUSPEND; sched_clock_suspend(); timekeeping_suspend(); } else { tick_suspend_local(); } raw_spin_unlock(&tick_freeze_lock); } /** * tick_unfreeze - Resume the local tick and (possibly) timekeeping. * * Check if this is the first CPU executing the function and if so, resume * timekeeping. Otherwise resume the local tick. * * Call with interrupts disabled. Must be balanced with %tick_freeze(). * Interrupts must not be enabled after the preceding %tick_freeze(). */ void tick_unfreeze(void) { raw_spin_lock(&tick_freeze_lock); if (tick_freeze_depth == num_online_cpus()) { timekeeping_resume(); sched_clock_resume(); system_state = SYSTEM_RUNNING; trace_suspend_resume(TPS("timekeeping_freeze"), smp_processor_id(), false); } else { touch_softlockup_watchdog(); tick_resume_local(); } tick_freeze_depth--; raw_spin_unlock(&tick_freeze_lock); } #endif /* CONFIG_SUSPEND */ /** * tick_init - initialize the tick control */ void __init tick_init(void) { tick_broadcast_init(); tick_nohz_init(); } |
33 63 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * inet6 interface/address list definitions * Linux INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> */ #ifndef _NET_IF_INET6_H #define _NET_IF_INET6_H #include <net/snmp.h> #include <linux/ipv6.h> #include <linux/refcount.h> /* inet6_dev.if_flags */ #define IF_RA_OTHERCONF 0x80 #define IF_RA_MANAGED 0x40 #define IF_RA_RCVD 0x20 #define IF_RS_SENT 0x10 #define IF_READY 0x80000000 enum { INET6_IFADDR_STATE_PREDAD, INET6_IFADDR_STATE_DAD, INET6_IFADDR_STATE_POSTDAD, INET6_IFADDR_STATE_ERRDAD, INET6_IFADDR_STATE_DEAD, }; struct inet6_ifaddr { struct in6_addr addr; __u32 prefix_len; __u32 rt_priority; /* In seconds, relative to tstamp. Expiry is at tstamp + HZ * lft. */ __u32 valid_lft; __u32 prefered_lft; refcount_t refcnt; spinlock_t lock; int state; __u32 flags; __u8 dad_probes; __u8 stable_privacy_retry; __u16 scope; __u64 dad_nonce; unsigned long cstamp; /* created timestamp */ unsigned long tstamp; /* updated timestamp */ struct delayed_work dad_work; struct inet6_dev *idev; struct fib6_info *rt; struct hlist_node addr_lst; struct list_head if_list; /* * Used to safely traverse idev->addr_list in process context * if the idev->lock needed to protect idev->addr_list cannot be held. * In that case, add the items to this list temporarily and iterate * without holding idev->lock. * See addrconf_ifdown and dev_forward_change. */ struct list_head if_list_aux; struct list_head tmp_list; struct inet6_ifaddr *ifpub; int regen_count; bool tokenized; u8 ifa_proto; struct rcu_head rcu; struct in6_addr peer_addr; }; struct ip6_sf_socklist { unsigned int sl_max; unsigned int sl_count; struct rcu_head rcu; struct in6_addr sl_addr[]; }; #define IP6_SFBLOCK 10 /* allocate this many at once */ struct ipv6_mc_socklist { struct in6_addr addr; int ifindex; unsigned int sfmode; /* MCAST_{INCLUDE,EXCLUDE} */ struct ipv6_mc_socklist __rcu *next; struct ip6_sf_socklist __rcu *sflist; struct rcu_head rcu; }; struct ip6_sf_list { struct ip6_sf_list __rcu *sf_next; struct in6_addr sf_addr; unsigned long sf_count[2]; /* include/exclude counts */ unsigned char sf_gsresp; /* include in g & s response? */ unsigned char sf_oldin; /* change state */ unsigned char sf_crcount; /* retrans. left to send */ struct rcu_head rcu; }; #define MAF_TIMER_RUNNING 0x01 #define MAF_LAST_REPORTER 0x02 #define MAF_LOADED 0x04 #define MAF_NOREPORT 0x08 #define MAF_GSQUERY 0x10 struct ifmcaddr6 { struct in6_addr mca_addr; struct inet6_dev *idev; struct ifmcaddr6 __rcu *next; struct ip6_sf_list __rcu *mca_sources; struct ip6_sf_list __rcu *mca_tomb; unsigned int mca_sfmode; unsigned char mca_crcount; unsigned long mca_sfcount[2]; struct delayed_work mca_work; unsigned int mca_flags; int mca_users; refcount_t mca_refcnt; unsigned long mca_cstamp; unsigned long mca_tstamp; struct rcu_head rcu; }; /* Anycast stuff */ struct ipv6_ac_socklist { struct in6_addr acl_addr; int acl_ifindex; struct ipv6_ac_socklist *acl_next; }; struct ifacaddr6 { struct in6_addr aca_addr; struct fib6_info *aca_rt; struct ifacaddr6 *aca_next; struct hlist_node aca_addr_lst; int aca_users; refcount_t aca_refcnt; unsigned long aca_cstamp; unsigned long aca_tstamp; struct rcu_head rcu; }; #define IFA_HOST IPV6_ADDR_LOOPBACK #define IFA_LINK IPV6_ADDR_LINKLOCAL #define IFA_SITE IPV6_ADDR_SITELOCAL struct ipv6_devstat { struct proc_dir_entry *proc_dir_entry; DEFINE_SNMP_STAT(struct ipstats_mib, ipv6); DEFINE_SNMP_STAT_ATOMIC(struct icmpv6_mib_device, icmpv6dev); DEFINE_SNMP_STAT_ATOMIC(struct icmpv6msg_mib_device, icmpv6msgdev); }; struct inet6_dev { struct net_device *dev; netdevice_tracker dev_tracker; struct list_head addr_list; struct ifmcaddr6 __rcu *mc_list; struct ifmcaddr6 __rcu *mc_tomb; unsigned char mc_qrv; /* Query Robustness Variable */ unsigned char mc_gq_running; unsigned char mc_ifc_count; unsigned char mc_dad_count; unsigned long mc_v1_seen; /* Max time we stay in MLDv1 mode */ unsigned long mc_qi; /* Query Interval */ unsigned long mc_qri; /* Query Response Interval */ unsigned long mc_maxdelay; struct delayed_work mc_gq_work; /* general query work */ struct delayed_work mc_ifc_work; /* interface change work */ struct delayed_work mc_dad_work; /* dad complete mc work */ struct delayed_work mc_query_work; /* mld query work */ struct delayed_work mc_report_work; /* mld report work */ struct sk_buff_head mc_query_queue; /* mld query queue */ struct sk_buff_head mc_report_queue; /* mld report queue */ spinlock_t mc_query_lock; /* mld query queue lock */ spinlock_t mc_report_lock; /* mld query report lock */ struct mutex mc_lock; /* mld global lock */ struct ifacaddr6 *ac_list; rwlock_t lock; refcount_t refcnt; __u32 if_flags; int dead; u32 desync_factor; struct list_head tempaddr_list; struct in6_addr token; struct neigh_parms *nd_parms; struct ipv6_devconf cnf; struct ipv6_devstat stats; struct timer_list rs_timer; __s32 rs_interval; /* in jiffies */ __u8 rs_probes; unsigned long tstamp; /* ipv6InterfaceTable update timestamp */ struct rcu_head rcu; unsigned int ra_mtu; }; static inline void ipv6_eth_mc_map(const struct in6_addr *addr, char *buf) { /* * +-------+-------+-------+-------+-------+-------+ * | 33 | 33 | DST13 | DST14 | DST15 | DST16 | * +-------+-------+-------+-------+-------+-------+ */ buf[0]= 0x33; buf[1]= 0x33; memcpy(buf + 2, &addr->s6_addr32[3], sizeof(__u32)); } static inline void ipv6_arcnet_mc_map(const struct in6_addr *addr, char *buf) { buf[0] = 0x00; } static inline void ipv6_ib_mc_map(const struct in6_addr *addr, const unsigned char *broadcast, char *buf) { unsigned char scope = broadcast[5] & 0xF; buf[0] = 0; /* Reserved */ buf[1] = 0xff; /* Multicast QPN */ buf[2] = 0xff; buf[3] = 0xff; buf[4] = 0xff; buf[5] = 0x10 | scope; /* scope from broadcast address */ buf[6] = 0x60; /* IPv6 signature */ buf[7] = 0x1b; buf[8] = broadcast[8]; /* P_Key */ buf[9] = broadcast[9]; memcpy(buf + 10, addr->s6_addr + 6, 10); } static inline int ipv6_ipgre_mc_map(const struct in6_addr *addr, const unsigned char *broadcast, char *buf) { if ((broadcast[0] | broadcast[1] | broadcast[2] | broadcast[3]) != 0) { memcpy(buf, broadcast, 4); } else { /* v4mapped? */ if ((addr->s6_addr32[0] | addr->s6_addr32[1] | (addr->s6_addr32[2] ^ htonl(0x0000ffff))) != 0) return -EINVAL; memcpy(buf, &addr->s6_addr32[3], 4); } return 0; } #endif |
5 1 4 7 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 | // SPDX-License-Identifier: GPL-2.0-only /* * (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> * (C) 2011 Patrick McHardy <kaber@trash.net> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/netfilter.h> #include <linux/netfilter/x_tables.h> #include <net/netfilter/nf_nat.h> static int xt_nat_checkentry_v0(const struct xt_tgchk_param *par) { const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; if (mr->rangesize != 1) { pr_info_ratelimited("multiple ranges no longer supported\n"); return -EINVAL; } return nf_ct_netns_get(par->net, par->family); } static int xt_nat_checkentry(const struct xt_tgchk_param *par) { return nf_ct_netns_get(par->net, par->family); } static void xt_nat_destroy(const struct xt_tgdtor_param *par) { nf_ct_netns_put(par->net, par->family); } static void xt_nat_convert_range(struct nf_nat_range2 *dst, const struct nf_nat_ipv4_range *src) { memset(&dst->min_addr, 0, sizeof(dst->min_addr)); memset(&dst->max_addr, 0, sizeof(dst->max_addr)); memset(&dst->base_proto, 0, sizeof(dst->base_proto)); dst->flags = src->flags; dst->min_addr.ip = src->min_ip; dst->max_addr.ip = src->max_ip; dst->min_proto = src->min; dst->max_proto = src->max; } static unsigned int xt_snat_target_v0(struct sk_buff *skb, const struct xt_action_param *par) { const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; struct nf_nat_range2 range; enum ip_conntrack_info ctinfo; struct nf_conn *ct; ct = nf_ct_get(skb, &ctinfo); WARN_ON(!(ct != NULL && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY))); xt_nat_convert_range(&range, &mr->range[0]); return nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC); } static unsigned int xt_dnat_target_v0(struct sk_buff *skb, const struct xt_action_param *par) { const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; struct nf_nat_range2 range; enum ip_conntrack_info ctinfo; struct nf_conn *ct; ct = nf_ct_get(skb, &ctinfo); WARN_ON(!(ct != NULL && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED))); xt_nat_convert_range(&range, &mr->range[0]); return nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST); } static unsigned int xt_snat_target_v1(struct sk_buff *skb, const struct xt_action_param *par) { const struct nf_nat_range *range_v1 = par->targinfo; struct nf_nat_range2 range; enum ip_conntrack_info ctinfo; struct nf_conn *ct; ct = nf_ct_get(skb, &ctinfo); WARN_ON(!(ct != NULL && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY))); memcpy(&range, range_v1, sizeof(*range_v1)); memset(&range.base_proto, 0, sizeof(range.base_proto)); return nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC); } static unsigned int xt_dnat_target_v1(struct sk_buff *skb, const struct xt_action_param *par) { const struct nf_nat_range *range_v1 = par->targinfo; struct nf_nat_range2 range; enum ip_conntrack_info ctinfo; struct nf_conn *ct; ct = nf_ct_get(skb, &ctinfo); WARN_ON(!(ct != NULL && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED))); memcpy(&range, range_v1, sizeof(*range_v1)); memset(&range.base_proto, 0, sizeof(range.base_proto)); return nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST); } static unsigned int xt_snat_target_v2(struct sk_buff *skb, const struct xt_action_param *par) { const struct nf_nat_range2 *range = par->targinfo; enum ip_conntrack_info ctinfo; struct nf_conn *ct; ct = nf_ct_get(skb, &ctinfo); WARN_ON(!(ct != NULL && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY))); return nf_nat_setup_info(ct, range, NF_NAT_MANIP_SRC); } static unsigned int xt_dnat_target_v2(struct sk_buff *skb, const struct xt_action_param *par) { const struct nf_nat_range2 *range = par->targinfo; enum ip_conntrack_info ctinfo; struct nf_conn *ct; ct = nf_ct_get(skb, &ctinfo); WARN_ON(!(ct != NULL && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED))); return nf_nat_setup_info(ct, range, NF_NAT_MANIP_DST); } static struct xt_target xt_nat_target_reg[] __read_mostly = { { .name = "SNAT", .revision = 0, .checkentry = xt_nat_checkentry_v0, .destroy = xt_nat_destroy, .target = xt_snat_target_v0, .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat), .family = NFPROTO_IPV4, .table = "nat", .hooks = (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_IN), .me = THIS_MODULE, }, { .name = "DNAT", .revision = 0, .checkentry = xt_nat_checkentry_v0, .destroy = xt_nat_destroy, .target = xt_dnat_target_v0, .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat), .family = NFPROTO_IPV4, .table = "nat", .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT), .me = THIS_MODULE, }, { .name = "SNAT", .revision = 1, .checkentry = xt_nat_checkentry, .destroy = xt_nat_destroy, .target = xt_snat_target_v1, .targetsize = sizeof(struct nf_nat_range), .table = "nat", .hooks = (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_IN), .me = THIS_MODULE, }, { .name = "DNAT", .revision = 1, .checkentry = xt_nat_checkentry, .destroy = xt_nat_destroy, .target = xt_dnat_target_v1, .targetsize = sizeof(struct nf_nat_range), .table = "nat", .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT), .me = THIS_MODULE, }, { .name = "SNAT", .revision = 2, .checkentry = xt_nat_checkentry, .destroy = xt_nat_destroy, .target = xt_snat_target_v2, .targetsize = sizeof(struct nf_nat_range2), .table = "nat", .hooks = (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_IN), .me = THIS_MODULE, }, { .name = "DNAT", .revision = 2, .checkentry = xt_nat_checkentry, .destroy = xt_nat_destroy, .target = xt_dnat_target_v2, .targetsize = sizeof(struct nf_nat_range2), .table = "nat", .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT), .me = THIS_MODULE, }, }; static int __init xt_nat_init(void) { return xt_register_targets(xt_nat_target_reg, ARRAY_SIZE(xt_nat_target_reg)); } static void __exit xt_nat_exit(void) { xt_unregister_targets(xt_nat_target_reg, ARRAY_SIZE(xt_nat_target_reg)); } module_init(xt_nat_init); module_exit(xt_nat_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_ALIAS("ipt_SNAT"); MODULE_ALIAS("ipt_DNAT"); MODULE_ALIAS("ip6t_SNAT"); MODULE_ALIAS("ip6t_DNAT"); MODULE_DESCRIPTION("SNAT and DNAT targets support"); |
90 89 90 4 88 19 134 71 71 71 78 1 1 91 1 90 90 4 88 88 15 15 3 1 9 3 13 3 2 1 1 1 1 8 1 186 1 72 2298 2282 190 90 2285 435 2293 169 58 58 3 1 3 5 3 2 8 8 40 8 9 30 40 1 30 40 27 1 5 25 26 1 12 13 1 39 2 37 9 16 15 1 1 1 1 2 1 1 15 1 7 425 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 | // SPDX-License-Identifier: GPL-2.0-or-later /* * INET 802.1Q VLAN * Ethernet-type device handling. * * Authors: Ben Greear <greearb@candelatech.com> * Please send support related email to: netdev@vger.kernel.org * VLAN Home Page: http://www.candelatech.com/~greear/vlan.html * * Fixes: * Fix for packet capture - Nick Eggleston <nick@dccinc.com>; * Add HW acceleration hooks - David S. Miller <davem@redhat.com>; * Correct all the locking - David S. Miller <davem@redhat.com>; * Use hash table for VLAN groups - David S. Miller <davem@redhat.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/capability.h> #include <linux/module.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/rculist.h> #include <net/p8022.h> #include <net/arp.h> #include <linux/rtnetlink.h> #include <linux/notifier.h> #include <net/rtnetlink.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <linux/uaccess.h> #include <linux/if_vlan.h> #include "vlan.h" #include "vlanproc.h" #define DRV_VERSION "1.8" /* Global VLAN variables */ unsigned int vlan_net_id __read_mostly; const char vlan_fullname[] = "802.1Q VLAN Support"; const char vlan_version[] = DRV_VERSION; /* End of global variables definitions. */ static int vlan_group_prealloc_vid(struct vlan_group *vg, __be16 vlan_proto, u16 vlan_id) { struct net_device **array; unsigned int vidx; unsigned int size; int pidx; ASSERT_RTNL(); pidx = vlan_proto_idx(vlan_proto); if (pidx < 0) return -EINVAL; vidx = vlan_id / VLAN_GROUP_ARRAY_PART_LEN; array = vg->vlan_devices_arrays[pidx][vidx]; if (array != NULL) return 0; size = sizeof(struct net_device *) * VLAN_GROUP_ARRAY_PART_LEN; array = kzalloc(size, GFP_KERNEL_ACCOUNT); if (array == NULL) return -ENOBUFS; /* paired with smp_rmb() in __vlan_group_get_device() */ smp_wmb(); vg->vlan_devices_arrays[pidx][vidx] = array; return 0; } static void vlan_stacked_transfer_operstate(const struct net_device *rootdev, struct net_device *dev, struct vlan_dev_priv *vlan) { if (!(vlan->flags & VLAN_FLAG_BRIDGE_BINDING)) netif_stacked_transfer_operstate(rootdev, dev); } void unregister_vlan_dev(struct net_device *dev, struct list_head *head) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct net_device *real_dev = vlan->real_dev; struct vlan_info *vlan_info; struct vlan_group *grp; u16 vlan_id = vlan->vlan_id; ASSERT_RTNL(); vlan_info = rtnl_dereference(real_dev->vlan_info); BUG_ON(!vlan_info); grp = &vlan_info->grp; grp->nr_vlan_devs--; if (vlan->flags & VLAN_FLAG_MVRP) vlan_mvrp_request_leave(dev); if (vlan->flags & VLAN_FLAG_GVRP) vlan_gvrp_request_leave(dev); vlan_group_set_device(grp, vlan->vlan_proto, vlan_id, NULL); netdev_upper_dev_unlink(real_dev, dev); /* Because unregister_netdevice_queue() makes sure at least one rcu * grace period is respected before device freeing, * we dont need to call synchronize_net() here. */ unregister_netdevice_queue(dev, head); if (grp->nr_vlan_devs == 0) { vlan_mvrp_uninit_applicant(real_dev); vlan_gvrp_uninit_applicant(real_dev); } vlan_vid_del(real_dev, vlan->vlan_proto, vlan_id); } int vlan_check_real_dev(struct net_device *real_dev, __be16 protocol, u16 vlan_id, struct netlink_ext_ack *extack) { const char *name = real_dev->name; if (real_dev->features & NETIF_F_VLAN_CHALLENGED) { pr_info("VLANs not supported on %s\n", name); NL_SET_ERR_MSG_MOD(extack, "VLANs not supported on device"); return -EOPNOTSUPP; } if (vlan_find_dev(real_dev, protocol, vlan_id) != NULL) { NL_SET_ERR_MSG_MOD(extack, "VLAN device already exists"); return -EEXIST; } return 0; } int register_vlan_dev(struct net_device *dev, struct netlink_ext_ack *extack) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct net_device *real_dev = vlan->real_dev; u16 vlan_id = vlan->vlan_id; struct vlan_info *vlan_info; struct vlan_group *grp; int err; err = vlan_vid_add(real_dev, vlan->vlan_proto, vlan_id); if (err) return err; vlan_info = rtnl_dereference(real_dev->vlan_info); /* vlan_info should be there now. vlan_vid_add took care of it */ BUG_ON(!vlan_info); grp = &vlan_info->grp; if (grp->nr_vlan_devs == 0) { err = vlan_gvrp_init_applicant(real_dev); if (err < 0) goto out_vid_del; err = vlan_mvrp_init_applicant(real_dev); if (err < 0) goto out_uninit_gvrp; } err = vlan_group_prealloc_vid(grp, vlan->vlan_proto, vlan_id); if (err < 0) goto out_uninit_mvrp; err = register_netdevice(dev); if (err < 0) goto out_uninit_mvrp; err = netdev_upper_dev_link(real_dev, dev, extack); if (err) goto out_unregister_netdev; vlan_stacked_transfer_operstate(real_dev, dev, vlan); linkwatch_fire_event(dev); /* _MUST_ call rfc2863_policy() */ /* So, got the sucker initialized, now lets place * it into our local structure. */ vlan_group_set_device(grp, vlan->vlan_proto, vlan_id, dev); grp->nr_vlan_devs++; return 0; out_unregister_netdev: unregister_netdevice(dev); out_uninit_mvrp: if (grp->nr_vlan_devs == 0) vlan_mvrp_uninit_applicant(real_dev); out_uninit_gvrp: if (grp->nr_vlan_devs == 0) vlan_gvrp_uninit_applicant(real_dev); out_vid_del: vlan_vid_del(real_dev, vlan->vlan_proto, vlan_id); return err; } /* Attach a VLAN device to a mac address (ie Ethernet Card). * Returns 0 if the device was created or a negative error code otherwise. */ static int register_vlan_device(struct net_device *real_dev, u16 vlan_id) { struct net_device *new_dev; struct vlan_dev_priv *vlan; struct net *net = dev_net(real_dev); struct vlan_net *vn = net_generic(net, vlan_net_id); char name[IFNAMSIZ]; int err; if (vlan_id >= VLAN_VID_MASK) return -ERANGE; err = vlan_check_real_dev(real_dev, htons(ETH_P_8021Q), vlan_id, NULL); if (err < 0) return err; /* Gotta set up the fields for the device. */ switch (vn->name_type) { case VLAN_NAME_TYPE_RAW_PLUS_VID: /* name will look like: eth1.0005 */ snprintf(name, IFNAMSIZ, "%s.%.4i", real_dev->name, vlan_id); break; case VLAN_NAME_TYPE_PLUS_VID_NO_PAD: /* Put our vlan.VID in the name. * Name will look like: vlan5 */ snprintf(name, IFNAMSIZ, "vlan%i", vlan_id); break; case VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD: /* Put our vlan.VID in the name. * Name will look like: eth0.5 */ snprintf(name, IFNAMSIZ, "%s.%i", real_dev->name, vlan_id); break; case VLAN_NAME_TYPE_PLUS_VID: /* Put our vlan.VID in the name. * Name will look like: vlan0005 */ default: snprintf(name, IFNAMSIZ, "vlan%.4i", vlan_id); } new_dev = alloc_netdev(sizeof(struct vlan_dev_priv), name, NET_NAME_UNKNOWN, vlan_setup); if (new_dev == NULL) return -ENOBUFS; dev_net_set(new_dev, net); /* need 4 bytes for extra VLAN header info, * hope the underlying device can handle it. */ new_dev->mtu = real_dev->mtu; vlan = vlan_dev_priv(new_dev); vlan->vlan_proto = htons(ETH_P_8021Q); vlan->vlan_id = vlan_id; vlan->real_dev = real_dev; vlan->dent = NULL; vlan->flags = VLAN_FLAG_REORDER_HDR; new_dev->rtnl_link_ops = &vlan_link_ops; err = register_vlan_dev(new_dev, NULL); if (err < 0) goto out_free_newdev; return 0; out_free_newdev: free_netdev(new_dev); return err; } static void vlan_sync_address(struct net_device *dev, struct net_device *vlandev) { struct vlan_dev_priv *vlan = vlan_dev_priv(vlandev); /* May be called without an actual change */ if (ether_addr_equal(vlan->real_dev_addr, dev->dev_addr)) return; /* vlan continues to inherit address of lower device */ if (vlan_dev_inherit_address(vlandev, dev)) goto out; /* vlan address was different from the old address and is equal to * the new address */ if (!ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr) && ether_addr_equal(vlandev->dev_addr, dev->dev_addr)) dev_uc_del(dev, vlandev->dev_addr); /* vlan address was equal to the old address and is different from * the new address */ if (ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr) && !ether_addr_equal(vlandev->dev_addr, dev->dev_addr)) dev_uc_add(dev, vlandev->dev_addr); out: ether_addr_copy(vlan->real_dev_addr, dev->dev_addr); } static void vlan_transfer_features(struct net_device *dev, struct net_device *vlandev) { struct vlan_dev_priv *vlan = vlan_dev_priv(vlandev); netif_inherit_tso_max(vlandev, dev); if (vlan_hw_offload_capable(dev->features, vlan->vlan_proto)) vlandev->hard_header_len = dev->hard_header_len; else vlandev->hard_header_len = dev->hard_header_len + VLAN_HLEN; #if IS_ENABLED(CONFIG_FCOE) vlandev->fcoe_ddp_xid = dev->fcoe_ddp_xid; #endif vlandev->priv_flags &= ~IFF_XMIT_DST_RELEASE; vlandev->priv_flags |= (vlan->real_dev->priv_flags & IFF_XMIT_DST_RELEASE); vlandev->hw_enc_features = vlan_tnl_features(vlan->real_dev); netdev_update_features(vlandev); } static int __vlan_device_event(struct net_device *dev, unsigned long event) { int err = 0; switch (event) { case NETDEV_CHANGENAME: vlan_proc_rem_dev(dev); err = vlan_proc_add_dev(dev); break; case NETDEV_REGISTER: err = vlan_proc_add_dev(dev); break; case NETDEV_UNREGISTER: vlan_proc_rem_dev(dev); break; } return err; } static int vlan_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(ptr); struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct vlan_group *grp; struct vlan_info *vlan_info; int i, flgs; struct net_device *vlandev; struct vlan_dev_priv *vlan; bool last = false; LIST_HEAD(list); int err; if (is_vlan_dev(dev)) { int err = __vlan_device_event(dev, event); if (err) return notifier_from_errno(err); } if ((event == NETDEV_UP) && (dev->features & NETIF_F_HW_VLAN_CTAG_FILTER)) { pr_info("adding VLAN 0 to HW filter on device %s\n", dev->name); vlan_vid_add(dev, htons(ETH_P_8021Q), 0); } if (event == NETDEV_DOWN && (dev->features & NETIF_F_HW_VLAN_CTAG_FILTER)) vlan_vid_del(dev, htons(ETH_P_8021Q), 0); vlan_info = rtnl_dereference(dev->vlan_info); if (!vlan_info) goto out; grp = &vlan_info->grp; /* It is OK that we do not hold the group lock right now, * as we run under the RTNL lock. */ switch (event) { case NETDEV_CHANGE: /* Propagate real device state to vlan devices */ vlan_group_for_each_dev(grp, i, vlandev) vlan_stacked_transfer_operstate(dev, vlandev, vlan_dev_priv(vlandev)); break; case NETDEV_CHANGEADDR: /* Adjust unicast filters on underlying device */ vlan_group_for_each_dev(grp, i, vlandev) { flgs = vlandev->flags; if (!(flgs & IFF_UP)) continue; vlan_sync_address(dev, vlandev); } break; case NETDEV_CHANGEMTU: vlan_group_for_each_dev(grp, i, vlandev) { if (vlandev->mtu <= dev->mtu) continue; dev_set_mtu(vlandev, dev->mtu); } break; case NETDEV_FEAT_CHANGE: /* Propagate device features to underlying device */ vlan_group_for_each_dev(grp, i, vlandev) vlan_transfer_features(dev, vlandev); break; case NETDEV_DOWN: { struct net_device *tmp; LIST_HEAD(close_list); /* Put all VLANs for this dev in the down state too. */ vlan_group_for_each_dev(grp, i, vlandev) { flgs = vlandev->flags; if (!(flgs & IFF_UP)) continue; vlan = vlan_dev_priv(vlandev); if (!(vlan->flags & VLAN_FLAG_LOOSE_BINDING)) list_add(&vlandev->close_list, &close_list); } dev_close_many(&close_list, false); list_for_each_entry_safe(vlandev, tmp, &close_list, close_list) { vlan_stacked_transfer_operstate(dev, vlandev, vlan_dev_priv(vlandev)); list_del_init(&vlandev->close_list); } list_del(&close_list); break; } case NETDEV_UP: /* Put all VLANs for this dev in the up state too. */ vlan_group_for_each_dev(grp, i, vlandev) { flgs = dev_get_flags(vlandev); if (flgs & IFF_UP) continue; vlan = vlan_dev_priv(vlandev); if (!(vlan->flags & VLAN_FLAG_LOOSE_BINDING)) dev_change_flags(vlandev, flgs | IFF_UP, extack); vlan_stacked_transfer_operstate(dev, vlandev, vlan); } break; case NETDEV_UNREGISTER: /* twiddle thumbs on netns device moves */ if (dev->reg_state != NETREG_UNREGISTERING) break; vlan_group_for_each_dev(grp, i, vlandev) { /* removal of last vid destroys vlan_info, abort * afterwards */ if (vlan_info->nr_vids == 1) last = true; unregister_vlan_dev(vlandev, &list); if (last) break; } unregister_netdevice_many(&list); break; case NETDEV_PRE_TYPE_CHANGE: /* Forbid underlaying device to change its type. */ if (vlan_uses_dev(dev)) return NOTIFY_BAD; break; case NETDEV_NOTIFY_PEERS: case NETDEV_BONDING_FAILOVER: case NETDEV_RESEND_IGMP: /* Propagate to vlan devices */ vlan_group_for_each_dev(grp, i, vlandev) call_netdevice_notifiers(event, vlandev); break; case NETDEV_CVLAN_FILTER_PUSH_INFO: err = vlan_filter_push_vids(vlan_info, htons(ETH_P_8021Q)); if (err) return notifier_from_errno(err); break; case NETDEV_CVLAN_FILTER_DROP_INFO: vlan_filter_drop_vids(vlan_info, htons(ETH_P_8021Q)); break; case NETDEV_SVLAN_FILTER_PUSH_INFO: err = vlan_filter_push_vids(vlan_info, htons(ETH_P_8021AD)); if (err) return notifier_from_errno(err); break; case NETDEV_SVLAN_FILTER_DROP_INFO: vlan_filter_drop_vids(vlan_info, htons(ETH_P_8021AD)); break; } out: return NOTIFY_DONE; } static struct notifier_block vlan_notifier_block __read_mostly = { .notifier_call = vlan_device_event, }; /* * VLAN IOCTL handler. * o execute requested action or pass command to the device driver * arg is really a struct vlan_ioctl_args __user *. */ static int vlan_ioctl_handler(struct net *net, void __user *arg) { int err; struct vlan_ioctl_args args; struct net_device *dev = NULL; if (copy_from_user(&args, arg, sizeof(struct vlan_ioctl_args))) return -EFAULT; /* Null terminate this sucker, just in case. */ args.device1[sizeof(args.device1) - 1] = 0; args.u.device2[sizeof(args.u.device2) - 1] = 0; rtnl_lock(); switch (args.cmd) { case SET_VLAN_INGRESS_PRIORITY_CMD: case SET_VLAN_EGRESS_PRIORITY_CMD: case SET_VLAN_FLAG_CMD: case ADD_VLAN_CMD: case DEL_VLAN_CMD: case GET_VLAN_REALDEV_NAME_CMD: case GET_VLAN_VID_CMD: err = -ENODEV; dev = __dev_get_by_name(net, args.device1); if (!dev) goto out; err = -EINVAL; if (args.cmd != ADD_VLAN_CMD && !is_vlan_dev(dev)) goto out; } switch (args.cmd) { case SET_VLAN_INGRESS_PRIORITY_CMD: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; vlan_dev_set_ingress_priority(dev, args.u.skb_priority, args.vlan_qos); err = 0; break; case SET_VLAN_EGRESS_PRIORITY_CMD: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; err = vlan_dev_set_egress_priority(dev, args.u.skb_priority, args.vlan_qos); break; case SET_VLAN_FLAG_CMD: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; err = vlan_dev_change_flags(dev, args.vlan_qos ? args.u.flag : 0, args.u.flag); break; case SET_VLAN_NAME_TYPE_CMD: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; if (args.u.name_type < VLAN_NAME_TYPE_HIGHEST) { struct vlan_net *vn; vn = net_generic(net, vlan_net_id); vn->name_type = args.u.name_type; err = 0; } else { err = -EINVAL; } break; case ADD_VLAN_CMD: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; err = register_vlan_device(dev, args.u.VID); break; case DEL_VLAN_CMD: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; unregister_vlan_dev(dev, NULL); err = 0; break; case GET_VLAN_REALDEV_NAME_CMD: err = 0; vlan_dev_get_realdev_name(dev, args.u.device2, sizeof(args.u.device2)); if (copy_to_user(arg, &args, sizeof(struct vlan_ioctl_args))) err = -EFAULT; break; case GET_VLAN_VID_CMD: err = 0; args.u.VID = vlan_dev_vlan_id(dev); if (copy_to_user(arg, &args, sizeof(struct vlan_ioctl_args))) err = -EFAULT; break; default: err = -EOPNOTSUPP; break; } out: rtnl_unlock(); return err; } static int __net_init vlan_init_net(struct net *net) { struct vlan_net *vn = net_generic(net, vlan_net_id); int err; vn->name_type = VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD; err = vlan_proc_init(net); return err; } static void __net_exit vlan_exit_net(struct net *net) { vlan_proc_cleanup(net); } static struct pernet_operations vlan_net_ops = { .init = vlan_init_net, .exit = vlan_exit_net, .id = &vlan_net_id, .size = sizeof(struct vlan_net), }; static int __init vlan_proto_init(void) { int err; pr_info("%s v%s\n", vlan_fullname, vlan_version); err = register_pernet_subsys(&vlan_net_ops); if (err < 0) goto err0; err = register_netdevice_notifier(&vlan_notifier_block); if (err < 0) goto err2; err = vlan_gvrp_init(); if (err < 0) goto err3; err = vlan_mvrp_init(); if (err < 0) goto err4; err = vlan_netlink_init(); if (err < 0) goto err5; vlan_ioctl_set(vlan_ioctl_handler); return 0; err5: vlan_mvrp_uninit(); err4: vlan_gvrp_uninit(); err3: unregister_netdevice_notifier(&vlan_notifier_block); err2: unregister_pernet_subsys(&vlan_net_ops); err0: return err; } static void __exit vlan_cleanup_module(void) { vlan_ioctl_set(NULL); vlan_netlink_fini(); unregister_netdevice_notifier(&vlan_notifier_block); unregister_pernet_subsys(&vlan_net_ops); rcu_barrier(); /* Wait for completion of call_rcu()'s */ vlan_mvrp_uninit(); vlan_gvrp_uninit(); } module_init(vlan_proto_init); module_exit(vlan_cleanup_module); MODULE_LICENSE("GPL"); MODULE_VERSION(DRV_VERSION); |
25257 26594 25246 23 19 131 131 8 58 58 1 2 2 52 2365 9 11 15 11 347 311 53 11 16 11 4 11 7 25 25 25 3 12 7 7 6 546 520 23 14 254 250 1 5 141 12 10 71 43 3 27 87 69 148 6 164 159 6 153 1 2 5 6 1 457 389 390 235 236 37 38 1 1 1 366 343 1 1 3 1 1 1 6 2 1 1 1 1 1 1 1 1 1 1 1 4 2 1 2 3737 3738 2932 2933 2551 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 | // SPDX-License-Identifier: GPL-2.0-or-later /* Common capabilities, needed by capability.o. */ #include <linux/capability.h> #include <linux/audit.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/lsm_hooks.h> #include <linux/file.h> #include <linux/mm.h> #include <linux/mman.h> #include <linux/pagemap.h> #include <linux/swap.h> #include <linux/skbuff.h> #include <linux/netlink.h> #include <linux/ptrace.h> #include <linux/xattr.h> #include <linux/hugetlb.h> #include <linux/mount.h> #include <linux/sched.h> #include <linux/prctl.h> #include <linux/securebits.h> #include <linux/user_namespace.h> #include <linux/binfmts.h> #include <linux/personality.h> #include <linux/mnt_idmapping.h> /* * If a non-root user executes a setuid-root binary in * !secure(SECURE_NOROOT) mode, then we raise capabilities. * However if fE is also set, then the intent is for only * the file capabilities to be applied, and the setuid-root * bit is left on either to change the uid (plausible) or * to get full privilege on a kernel without file capabilities * support. So in that case we do not raise capabilities. * * Warn if that happens, once per boot. */ static void warn_setuid_and_fcaps_mixed(const char *fname) { static int warned; if (!warned) { printk(KERN_INFO "warning: `%s' has both setuid-root and" " effective capabilities. Therefore not raising all" " capabilities.\n", fname); warned = 1; } } /** * cap_capable - Determine whether a task has a particular effective capability * @cred: The credentials to use * @targ_ns: The user namespace in which we need the capability * @cap: The capability to check for * @opts: Bitmask of options defined in include/linux/security.h * * Determine whether the nominated task has the specified capability amongst * its effective set, returning 0 if it does, -ve if it does not. * * NOTE WELL: cap_has_capability() cannot be used like the kernel's capable() * and has_capability() functions. That is, it has the reverse semantics: * cap_has_capability() returns 0 when a task has a capability, but the * kernel's capable() and has_capability() returns 1 for this case. */ int cap_capable(const struct cred *cred, struct user_namespace *targ_ns, int cap, unsigned int opts) { struct user_namespace *ns = targ_ns; /* See if cred has the capability in the target user namespace * by examining the target user namespace and all of the target * user namespace's parents. */ for (;;) { /* Do we have the necessary capabilities? */ if (ns == cred->user_ns) return cap_raised(cred->cap_effective, cap) ? 0 : -EPERM; /* * If we're already at a lower level than we're looking for, * we're done searching. */ if (ns->level <= cred->user_ns->level) return -EPERM; /* * The owner of the user namespace in the parent of the * user namespace has all caps. */ if ((ns->parent == cred->user_ns) && uid_eq(ns->owner, cred->euid)) return 0; /* * If you have a capability in a parent user ns, then you have * it over all children user namespaces as well. */ ns = ns->parent; } /* We never get here */ } /** * cap_settime - Determine whether the current process may set the system clock * @ts: The time to set * @tz: The timezone to set * * Determine whether the current process may set the system clock and timezone * information, returning 0 if permission granted, -ve if denied. */ int cap_settime(const struct timespec64 *ts, const struct timezone *tz) { if (!capable(CAP_SYS_TIME)) return -EPERM; return 0; } /** * cap_ptrace_access_check - Determine whether the current process may access * another * @child: The process to be accessed * @mode: The mode of attachment. * * If we are in the same or an ancestor user_ns and have all the target * task's capabilities, then ptrace access is allowed. * If we have the ptrace capability to the target user_ns, then ptrace * access is allowed. * Else denied. * * Determine whether a process may access another, returning 0 if permission * granted, -ve if denied. */ int cap_ptrace_access_check(struct task_struct *child, unsigned int mode) { int ret = 0; const struct cred *cred, *child_cred; const kernel_cap_t *caller_caps; rcu_read_lock(); cred = current_cred(); child_cred = __task_cred(child); if (mode & PTRACE_MODE_FSCREDS) caller_caps = &cred->cap_effective; else caller_caps = &cred->cap_permitted; if (cred->user_ns == child_cred->user_ns && cap_issubset(child_cred->cap_permitted, *caller_caps)) goto out; if (ns_capable(child_cred->user_ns, CAP_SYS_PTRACE)) goto out; ret = -EPERM; out: rcu_read_unlock(); return ret; } /** * cap_ptrace_traceme - Determine whether another process may trace the current * @parent: The task proposed to be the tracer * * If parent is in the same or an ancestor user_ns and has all current's * capabilities, then ptrace access is allowed. * If parent has the ptrace capability to current's user_ns, then ptrace * access is allowed. * Else denied. * * Determine whether the nominated task is permitted to trace the current * process, returning 0 if permission is granted, -ve if denied. */ int cap_ptrace_traceme(struct task_struct *parent) { int ret = 0; const struct cred *cred, *child_cred; rcu_read_lock(); cred = __task_cred(parent); child_cred = current_cred(); if (cred->user_ns == child_cred->user_ns && cap_issubset(child_cred->cap_permitted, cred->cap_permitted)) goto out; if (has_ns_capability(parent, child_cred->user_ns, CAP_SYS_PTRACE)) goto out; ret = -EPERM; out: rcu_read_unlock(); return ret; } /** * cap_capget - Retrieve a task's capability sets * @target: The task from which to retrieve the capability sets * @effective: The place to record the effective set * @inheritable: The place to record the inheritable set * @permitted: The place to record the permitted set * * This function retrieves the capabilities of the nominated task and returns * them to the caller. */ int cap_capget(struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted) { const struct cred *cred; /* Derived from kernel/capability.c:sys_capget. */ rcu_read_lock(); cred = __task_cred(target); *effective = cred->cap_effective; *inheritable = cred->cap_inheritable; *permitted = cred->cap_permitted; rcu_read_unlock(); return 0; } /* * Determine whether the inheritable capabilities are limited to the old * permitted set. Returns 1 if they are limited, 0 if they are not. */ static inline int cap_inh_is_capped(void) { /* they are so limited unless the current task has the CAP_SETPCAP * capability */ if (cap_capable(current_cred(), current_cred()->user_ns, CAP_SETPCAP, CAP_OPT_NONE) == 0) return 0; return 1; } /** * cap_capset - Validate and apply proposed changes to current's capabilities * @new: The proposed new credentials; alterations should be made here * @old: The current task's current credentials * @effective: A pointer to the proposed new effective capabilities set * @inheritable: A pointer to the proposed new inheritable capabilities set * @permitted: A pointer to the proposed new permitted capabilities set * * This function validates and applies a proposed mass change to the current * process's capability sets. The changes are made to the proposed new * credentials, and assuming no error, will be committed by the caller of LSM. */ int cap_capset(struct cred *new, const struct cred *old, const kernel_cap_t *effective, const kernel_cap_t *inheritable, const kernel_cap_t *permitted) { if (cap_inh_is_capped() && !cap_issubset(*inheritable, cap_combine(old->cap_inheritable, old->cap_permitted))) /* incapable of using this inheritable set */ return -EPERM; if (!cap_issubset(*inheritable, cap_combine(old->cap_inheritable, old->cap_bset))) /* no new pI capabilities outside bounding set */ return -EPERM; /* verify restrictions on target's new Permitted set */ if (!cap_issubset(*permitted, old->cap_permitted)) return -EPERM; /* verify the _new_Effective_ is a subset of the _new_Permitted_ */ if (!cap_issubset(*effective, *permitted)) return -EPERM; new->cap_effective = *effective; new->cap_inheritable = *inheritable; new->cap_permitted = *permitted; /* * Mask off ambient bits that are no longer both permitted and * inheritable. */ new->cap_ambient = cap_intersect(new->cap_ambient, cap_intersect(*permitted, *inheritable)); if (WARN_ON(!cap_ambient_invariant_ok(new))) return -EINVAL; return 0; } /** * cap_inode_need_killpriv - Determine if inode change affects privileges * @dentry: The inode/dentry in being changed with change marked ATTR_KILL_PRIV * * Determine if an inode having a change applied that's marked ATTR_KILL_PRIV * affects the security markings on that inode, and if it is, should * inode_killpriv() be invoked or the change rejected. * * Return: 1 if security.capability has a value, meaning inode_killpriv() * is required, 0 otherwise, meaning inode_killpriv() is not required. */ int cap_inode_need_killpriv(struct dentry *dentry) { struct inode *inode = d_backing_inode(dentry); int error; error = __vfs_getxattr(dentry, inode, XATTR_NAME_CAPS, NULL, 0); return error > 0; } /** * cap_inode_killpriv - Erase the security markings on an inode * * @mnt_userns: user namespace of the mount the inode was found from * @dentry: The inode/dentry to alter * * Erase the privilege-enhancing security markings on an inode. * * If the inode has been found through an idmapped mount the user namespace of * the vfsmount must be passed through @mnt_userns. This function will then * take care to map the inode according to @mnt_userns before checking * permissions. On non-idmapped mounts or if permission checking is to be * performed on the raw inode simply passs init_user_ns. * * Return: 0 if successful, -ve on error. */ int cap_inode_killpriv(struct user_namespace *mnt_userns, struct dentry *dentry) { int error; error = __vfs_removexattr(mnt_userns, dentry, XATTR_NAME_CAPS); if (error == -EOPNOTSUPP) error = 0; return error; } static bool rootid_owns_currentns(kuid_t kroot) { struct user_namespace *ns; if (!uid_valid(kroot)) return false; for (ns = current_user_ns(); ; ns = ns->parent) { if (from_kuid(ns, kroot) == 0) return true; if (ns == &init_user_ns) break; } return false; } static __u32 sansflags(__u32 m) { return m & ~VFS_CAP_FLAGS_EFFECTIVE; } static bool is_v2header(size_t size, const struct vfs_cap_data *cap) { if (size != XATTR_CAPS_SZ_2) return false; return sansflags(le32_to_cpu(cap->magic_etc)) == VFS_CAP_REVISION_2; } static bool is_v3header(size_t size, const struct vfs_cap_data *cap) { if (size != XATTR_CAPS_SZ_3) return false; return sansflags(le32_to_cpu(cap->magic_etc)) == VFS_CAP_REVISION_3; } /* * getsecurity: We are called for security.* before any attempt to read the * xattr from the inode itself. * * This gives us a chance to read the on-disk value and convert it. If we * return -EOPNOTSUPP, then vfs_getxattr() will call the i_op handler. * * Note we are not called by vfs_getxattr_alloc(), but that is only called * by the integrity subsystem, which really wants the unconverted values - * so that's good. */ int cap_inode_getsecurity(struct user_namespace *mnt_userns, struct inode *inode, const char *name, void **buffer, bool alloc) { int size, ret; kuid_t kroot; u32 nsmagic, magic; uid_t root, mappedroot; char *tmpbuf = NULL; struct vfs_cap_data *cap; struct vfs_ns_cap_data *nscap = NULL; struct dentry *dentry; struct user_namespace *fs_ns; if (strcmp(name, "capability") != 0) return -EOPNOTSUPP; dentry = d_find_any_alias(inode); if (!dentry) return -EINVAL; size = sizeof(struct vfs_ns_cap_data); ret = (int)vfs_getxattr_alloc(mnt_userns, dentry, XATTR_NAME_CAPS, &tmpbuf, size, GFP_NOFS); dput(dentry); if (ret < 0 || !tmpbuf) { size = ret; goto out_free; } fs_ns = inode->i_sb->s_user_ns; cap = (struct vfs_cap_data *) tmpbuf; if (is_v2header((size_t) ret, cap)) { root = 0; } else if (is_v3header((size_t) ret, cap)) { nscap = (struct vfs_ns_cap_data *) tmpbuf; root = le32_to_cpu(nscap->rootid); } else { size = -EINVAL; goto out_free; } kroot = make_kuid(fs_ns, root); /* If this is an idmapped mount shift the kuid. */ kroot = mapped_kuid_fs(mnt_userns, fs_ns, kroot); /* If the root kuid maps to a valid uid in current ns, then return * this as a nscap. */ mappedroot = from_kuid(current_user_ns(), kroot); if (mappedroot != (uid_t)-1 && mappedroot != (uid_t)0) { size = sizeof(struct vfs_ns_cap_data); if (alloc) { if (!nscap) { /* v2 -> v3 conversion */ nscap = kzalloc(size, GFP_ATOMIC); if (!nscap) { size = -ENOMEM; goto out_free; } nsmagic = VFS_CAP_REVISION_3; magic = le32_to_cpu(cap->magic_etc); if (magic & VFS_CAP_FLAGS_EFFECTIVE) nsmagic |= VFS_CAP_FLAGS_EFFECTIVE; memcpy(&nscap->data, &cap->data, sizeof(__le32) * 2 * VFS_CAP_U32); nscap->magic_etc = cpu_to_le32(nsmagic); } else { /* use allocated v3 buffer */ tmpbuf = NULL; } nscap->rootid = cpu_to_le32(mappedroot); *buffer = nscap; } goto out_free; } if (!rootid_owns_currentns(kroot)) { size = -EOVERFLOW; goto out_free; } /* This comes from a parent namespace. Return as a v2 capability */ size = sizeof(struct vfs_cap_data); if (alloc) { if (nscap) { /* v3 -> v2 conversion */ cap = kzalloc(size, GFP_ATOMIC); if (!cap) { size = -ENOMEM; goto out_free; } magic = VFS_CAP_REVISION_2; nsmagic = le32_to_cpu(nscap->magic_etc); if (nsmagic & VFS_CAP_FLAGS_EFFECTIVE) magic |= VFS_CAP_FLAGS_EFFECTIVE; memcpy(&cap->data, &nscap->data, sizeof(__le32) * 2 * VFS_CAP_U32); cap->magic_etc = cpu_to_le32(magic); } else { /* use unconverted v2 */ tmpbuf = NULL; } *buffer = cap; } out_free: kfree(tmpbuf); return size; } /** * rootid_from_xattr - translate root uid of vfs caps * * @value: vfs caps value which may be modified by this function * @size: size of @ivalue * @task_ns: user namespace of the caller * @mnt_userns: user namespace of the mount the inode was found from * @fs_userns: user namespace of the filesystem * * If the inode has been found through an idmapped mount the user namespace of * the vfsmount must be passed through @mnt_userns. This function will then * take care to map the inode according to @mnt_userns before checking * permissions. On non-idmapped mounts or if permission checking is to be * performed on the raw inode simply passs init_user_ns. */ static kuid_t rootid_from_xattr(const void *value, size_t size, struct user_namespace *task_ns, struct user_namespace *mnt_userns, struct user_namespace *fs_userns) { const struct vfs_ns_cap_data *nscap = value; kuid_t rootkid; uid_t rootid = 0; if (size == XATTR_CAPS_SZ_3) rootid = le32_to_cpu(nscap->rootid); rootkid = make_kuid(task_ns, rootid); return mapped_kuid_user(mnt_userns, fs_userns, rootkid); } static bool validheader(size_t size, const struct vfs_cap_data *cap) { return is_v2header(size, cap) || is_v3header(size, cap); } /** * cap_convert_nscap - check vfs caps * * @mnt_userns: user namespace of the mount the inode was found from * @dentry: used to retrieve inode to check permissions on * @ivalue: vfs caps value which may be modified by this function * @size: size of @ivalue * * User requested a write of security.capability. If needed, update the * xattr to change from v2 to v3, or to fixup the v3 rootid. * * If the inode has been found through an idmapped mount the user namespace of * the vfsmount must be passed through @mnt_userns. This function will then * take care to map the inode according to @mnt_userns before checking * permissions. On non-idmapped mounts or if permission checking is to be * performed on the raw inode simply passs init_user_ns. * * Return: On success, return the new size; on error, return < 0. */ int cap_convert_nscap(struct user_namespace *mnt_userns, struct dentry *dentry, const void **ivalue, size_t size) { struct vfs_ns_cap_data *nscap; uid_t nsrootid; const struct vfs_cap_data *cap = *ivalue; __u32 magic, nsmagic; struct inode *inode = d_backing_inode(dentry); struct user_namespace *task_ns = current_user_ns(), *fs_ns = inode->i_sb->s_user_ns; kuid_t rootid; size_t newsize; if (!*ivalue) return -EINVAL; if (!validheader(size, cap)) return -EINVAL; if (!capable_wrt_inode_uidgid(mnt_userns, inode, CAP_SETFCAP)) return -EPERM; if (size == XATTR_CAPS_SZ_2 && (mnt_userns == fs_ns)) if (ns_capable(inode->i_sb->s_user_ns, CAP_SETFCAP)) /* user is privileged, just write the v2 */ return size; rootid = rootid_from_xattr(*ivalue, size, task_ns, mnt_userns, fs_ns); if (!uid_valid(rootid)) return -EINVAL; nsrootid = from_kuid(fs_ns, rootid); if (nsrootid == -1) return -EINVAL; newsize = sizeof(struct vfs_ns_cap_data); nscap = kmalloc(newsize, GFP_ATOMIC); if (!nscap) return -ENOMEM; nscap->rootid = cpu_to_le32(nsrootid); nsmagic = VFS_CAP_REVISION_3; magic = le32_to_cpu(cap->magic_etc); if (magic & VFS_CAP_FLAGS_EFFECTIVE) nsmagic |= VFS_CAP_FLAGS_EFFECTIVE; nscap->magic_etc = cpu_to_le32(nsmagic); memcpy(&nscap->data, &cap->data, sizeof(__le32) * 2 * VFS_CAP_U32); *ivalue = nscap; return newsize; } /* * Calculate the new process capability sets from the capability sets attached * to a file. */ static inline int bprm_caps_from_vfs_caps(struct cpu_vfs_cap_data *caps, struct linux_binprm *bprm, bool *effective, bool *has_fcap) { struct cred *new = bprm->cred; unsigned i; int ret = 0; if (caps->magic_etc & VFS_CAP_FLAGS_EFFECTIVE) *effective = true; if (caps->magic_etc & VFS_CAP_REVISION_MASK) *has_fcap = true; CAP_FOR_EACH_U32(i) { __u32 permitted = caps->permitted.cap[i]; __u32 inheritable = caps->inheritable.cap[i]; /* * pP' = (X & fP) | (pI & fI) * The addition of pA' is handled later. */ new->cap_permitted.cap[i] = (new->cap_bset.cap[i] & permitted) | (new->cap_inheritable.cap[i] & inheritable); if (permitted & ~new->cap_permitted.cap[i]) /* insufficient to execute correctly */ ret = -EPERM; } /* * For legacy apps, with no internal support for recognizing they * do not have enough capabilities, we return an error if they are * missing some "forced" (aka file-permitted) capabilities. */ return *effective ? ret : 0; } /** * get_vfs_caps_from_disk - retrieve vfs caps from disk * * @mnt_userns: user namespace of the mount the inode was found from * @dentry: dentry from which @inode is retrieved * @cpu_caps: vfs capabilities * * Extract the on-exec-apply capability sets for an executable file. * * If the inode has been found through an idmapped mount the user namespace of * the vfsmount must be passed through @mnt_userns. This function will then * take care to map the inode according to @mnt_userns before checking * permissions. On non-idmapped mounts or if permission checking is to be * performed on the raw inode simply passs init_user_ns. */ int get_vfs_caps_from_disk(struct user_namespace *mnt_userns, const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps) { struct inode *inode = d_backing_inode(dentry); __u32 magic_etc; unsigned tocopy, i; int size; struct vfs_ns_cap_data data, *nscaps = &data; struct vfs_cap_data *caps = (struct vfs_cap_data *) &data; kuid_t rootkuid; struct user_namespace *fs_ns; memset(cpu_caps, 0, sizeof(struct cpu_vfs_cap_data)); if (!inode) return -ENODATA; fs_ns = inode->i_sb->s_user_ns; size = __vfs_getxattr((struct dentry *)dentry, inode, XATTR_NAME_CAPS, &data, XATTR_CAPS_SZ); if (size == -ENODATA || size == -EOPNOTSUPP) /* no data, that's ok */ return -ENODATA; if (size < 0) return size; if (size < sizeof(magic_etc)) return -EINVAL; cpu_caps->magic_etc = magic_etc = le32_to_cpu(caps->magic_etc); rootkuid = make_kuid(fs_ns, 0); switch (magic_etc & VFS_CAP_REVISION_MASK) { case VFS_CAP_REVISION_1: if (size != XATTR_CAPS_SZ_1) return -EINVAL; tocopy = VFS_CAP_U32_1; break; case VFS_CAP_REVISION_2: if (size != XATTR_CAPS_SZ_2) return -EINVAL; tocopy = VFS_CAP_U32_2; break; case VFS_CAP_REVISION_3: if (size != XATTR_CAPS_SZ_3) return -EINVAL; tocopy = VFS_CAP_U32_3; rootkuid = make_kuid(fs_ns, le32_to_cpu(nscaps->rootid)); break; default: return -EINVAL; } /* Limit the caps to the mounter of the filesystem * or the more limited uid specified in the xattr. */ rootkuid = mapped_kuid_fs(mnt_userns, fs_ns, rootkuid); if (!rootid_owns_currentns(rootkuid)) return -ENODATA; CAP_FOR_EACH_U32(i) { if (i >= tocopy) break; cpu_caps->permitted.cap[i] = le32_to_cpu(caps->data[i].permitted); cpu_caps->inheritable.cap[i] = le32_to_cpu(caps->data[i].inheritable); } cpu_caps->permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK; cpu_caps->inheritable.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK; cpu_caps->rootid = rootkuid; return 0; } /* * Attempt to get the on-exec apply capability sets for an executable file from * its xattrs and, if present, apply them to the proposed credentials being * constructed by execve(). */ static int get_file_caps(struct linux_binprm *bprm, struct file *file, bool *effective, bool *has_fcap) { int rc = 0; struct cpu_vfs_cap_data vcaps; cap_clear(bprm->cred->cap_permitted); if (!file_caps_enabled) return 0; if (!mnt_may_suid(file->f_path.mnt)) return 0; /* * This check is redundant with mnt_may_suid() but is kept to make * explicit that capability bits are limited to s_user_ns and its * descendants. */ if (!current_in_userns(file->f_path.mnt->mnt_sb->s_user_ns)) return 0; rc = get_vfs_caps_from_disk(file_mnt_user_ns(file), file->f_path.dentry, &vcaps); if (rc < 0) { if (rc == -EINVAL) printk(KERN_NOTICE "Invalid argument reading file caps for %s\n", bprm->filename); else if (rc == -ENODATA) rc = 0; goto out; } rc = bprm_caps_from_vfs_caps(&vcaps, bprm, effective, has_fcap); out: if (rc) cap_clear(bprm->cred->cap_permitted); return rc; } static inline bool root_privileged(void) { return !issecure(SECURE_NOROOT); } static inline bool __is_real(kuid_t uid, struct cred *cred) { return uid_eq(cred->uid, uid); } static inline bool __is_eff(kuid_t uid, struct cred *cred) { return uid_eq(cred->euid, uid); } static inline bool __is_suid(kuid_t uid, struct cred *cred) { return !__is_real(uid, cred) && __is_eff(uid, cred); } /* * handle_privileged_root - Handle case of privileged root * @bprm: The execution parameters, including the proposed creds * @has_fcap: Are any file capabilities set? * @effective: Do we have effective root privilege? * @root_uid: This namespace' root UID WRT initial USER namespace * * Handle the case where root is privileged and hasn't been neutered by * SECURE_NOROOT. If file capabilities are set, they won't be combined with * set UID root and nothing is changed. If we are root, cap_permitted is * updated. If we have become set UID root, the effective bit is set. */ static void handle_privileged_root(struct linux_binprm *bprm, bool has_fcap, bool *effective, kuid_t root_uid) { const struct cred *old = current_cred(); struct cred *new = bprm->cred; if (!root_privileged()) return; /* * If the legacy file capability is set, then don't set privs * for a setuid root binary run by a non-root user. Do set it * for a root user just to cause least surprise to an admin. */ if (has_fcap && __is_suid(root_uid, new)) { warn_setuid_and_fcaps_mixed(bprm->filename); return; } /* * To support inheritance of root-permissions and suid-root * executables under compatibility mode, we override the * capability sets for the file. */ if (__is_eff(root_uid, new) || __is_real(root_uid, new)) { /* pP' = (cap_bset & ~0) | (pI & ~0) */ new->cap_permitted = cap_combine(old->cap_bset, old->cap_inheritable); } /* * If only the real uid is 0, we do not set the effective bit. */ if (__is_eff(root_uid, new)) *effective = true; } #define __cap_gained(field, target, source) \ !cap_issubset(target->cap_##field, source->cap_##field) #define __cap_grew(target, source, cred) \ !cap_issubset(cred->cap_##target, cred->cap_##source) #define __cap_full(field, cred) \ cap_issubset(CAP_FULL_SET, cred->cap_##field) static inline bool __is_setuid(struct cred *new, const struct cred *old) { return !uid_eq(new->euid, old->uid); } static inline bool __is_setgid(struct cred *new, const struct cred *old) { return !gid_eq(new->egid, old->gid); } /* * 1) Audit candidate if current->cap_effective is set * * We do not bother to audit if 3 things are true: * 1) cap_effective has all caps * 2) we became root *OR* are were already root * 3) root is supposed to have all caps (SECURE_NOROOT) * Since this is just a normal root execing a process. * * Number 1 above might fail if you don't have a full bset, but I think * that is interesting information to audit. * * A number of other conditions require logging: * 2) something prevented setuid root getting all caps * 3) non-setuid root gets fcaps * 4) non-setuid root gets ambient */ static inline bool nonroot_raised_pE(struct cred *new, const struct cred *old, kuid_t root, bool has_fcap) { bool ret = false; if ((__cap_grew(effective, ambient, new) && !(__cap_full(effective, new) && (__is_eff(root, new) || __is_real(root, new)) && root_privileged())) || (root_privileged() && __is_suid(root, new) && !__cap_full(effective, new)) || (!__is_setuid(new, old) && ((has_fcap && __cap_gained(permitted, new, old)) || __cap_gained(ambient, new, old)))) ret = true; return ret; } /** * cap_bprm_creds_from_file - Set up the proposed credentials for execve(). * @bprm: The execution parameters, including the proposed creds * @file: The file to pull the credentials from * * Set up the proposed credentials for a new execution context being * constructed by execve(). The proposed creds in @bprm->cred is altered, * which won't take effect immediately. * * Return: 0 if successful, -ve on error. */ int cap_bprm_creds_from_file(struct linux_binprm *bprm, struct file *file) { /* Process setpcap binaries and capabilities for uid 0 */ const struct cred *old = current_cred(); struct cred *new = bprm->cred; bool effective = false, has_fcap = false, is_setid; int ret; kuid_t root_uid; if (WARN_ON(!cap_ambient_invariant_ok(old))) return -EPERM; ret = get_file_caps(bprm, file, &effective, &has_fcap); if (ret < 0) return ret; root_uid = make_kuid(new->user_ns, 0); handle_privileged_root(bprm, has_fcap, &effective, root_uid); /* if we have fs caps, clear dangerous personality flags */ if (__cap_gained(permitted, new, old)) bprm->per_clear |= PER_CLEAR_ON_SETID; /* Don't let someone trace a set[ug]id/setpcap binary with the revised * credentials unless they have the appropriate permit. * * In addition, if NO_NEW_PRIVS, then ensure we get no new privs. */ is_setid = __is_setuid(new, old) || __is_setgid(new, old); if ((is_setid || __cap_gained(permitted, new, old)) && ((bprm->unsafe & ~LSM_UNSAFE_PTRACE) || !ptracer_capable(current, new->user_ns))) { /* downgrade; they get no more than they had, and maybe less */ if (!ns_capable(new->user_ns, CAP_SETUID) || (bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS)) { new->euid = new->uid; new->egid = new->gid; } new->cap_permitted = cap_intersect(new->cap_permitted, old->cap_permitted); } new->suid = new->fsuid = new->euid; new->sgid = new->fsgid = new->egid; /* File caps or setid cancels ambient. */ if (has_fcap || is_setid) cap_clear(new->cap_ambient); /* * Now that we've computed pA', update pP' to give: * pP' = (X & fP) | (pI & fI) | pA' */ new->cap_permitted = cap_combine(new->cap_permitted, new->cap_ambient); /* * Set pE' = (fE ? pP' : pA'). Because pA' is zero if fE is set, * this is the same as pE' = (fE ? pP' : 0) | pA'. */ if (effective) new->cap_effective = new->cap_permitted; else new->cap_effective = new->cap_ambient; if (WARN_ON(!cap_ambient_invariant_ok(new))) return -EPERM; if (nonroot_raised_pE(new, old, root_uid, has_fcap)) { ret = audit_log_bprm_fcaps(bprm, new, old); if (ret < 0) return ret; } new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS); if (WARN_ON(!cap_ambient_invariant_ok(new))) return -EPERM; /* Check for privilege-elevated exec. */ if (is_setid || (!__is_real(root_uid, new) && (effective || __cap_grew(permitted, ambient, new)))) bprm->secureexec = 1; return 0; } /** * cap_inode_setxattr - Determine whether an xattr may be altered * @dentry: The inode/dentry being altered * @name: The name of the xattr to be changed * @value: The value that the xattr will be changed to * @size: The size of value * @flags: The replacement flag * * Determine whether an xattr may be altered or set on an inode, returning 0 if * permission is granted, -ve if denied. * * This is used to make sure security xattrs don't get updated or set by those * who aren't privileged to do so. */ int cap_inode_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { struct user_namespace *user_ns = dentry->d_sb->s_user_ns; /* Ignore non-security xattrs */ if (strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) != 0) return 0; /* * For XATTR_NAME_CAPS the check will be done in * cap_convert_nscap(), called by setxattr() */ if (strcmp(name, XATTR_NAME_CAPS) == 0) return 0; if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; return 0; } /** * cap_inode_removexattr - Determine whether an xattr may be removed * * @mnt_userns: User namespace of the mount the inode was found from * @dentry: The inode/dentry being altered * @name: The name of the xattr to be changed * * Determine whether an xattr may be removed from an inode, returning 0 if * permission is granted, -ve if denied. * * If the inode has been found through an idmapped mount the user namespace of * the vfsmount must be passed through @mnt_userns. This function will then * take care to map the inode according to @mnt_userns before checking * permissions. On non-idmapped mounts or if permission checking is to be * performed on the raw inode simply passs init_user_ns. * * This is used to make sure security xattrs don't get removed by those who * aren't privileged to remove them. */ int cap_inode_removexattr(struct user_namespace *mnt_userns, struct dentry *dentry, const char *name) { struct user_namespace *user_ns = dentry->d_sb->s_user_ns; /* Ignore non-security xattrs */ if (strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) != 0) return 0; if (strcmp(name, XATTR_NAME_CAPS) == 0) { /* security.capability gets namespaced */ struct inode *inode = d_backing_inode(dentry); if (!inode) return -EINVAL; if (!capable_wrt_inode_uidgid(mnt_userns, inode, CAP_SETFCAP)) return -EPERM; return 0; } if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; return 0; } /* * cap_emulate_setxuid() fixes the effective / permitted capabilities of * a process after a call to setuid, setreuid, or setresuid. * * 1) When set*uiding _from_ one of {r,e,s}uid == 0 _to_ all of * {r,e,s}uid != 0, the permitted and effective capabilities are * cleared. * * 2) When set*uiding _from_ euid == 0 _to_ euid != 0, the effective * capabilities of the process are cleared. * * 3) When set*uiding _from_ euid != 0 _to_ euid == 0, the effective * capabilities are set to the permitted capabilities. * * fsuid is handled elsewhere. fsuid == 0 and {r,e,s}uid!= 0 should * never happen. * * -astor * * cevans - New behaviour, Oct '99 * A process may, via prctl(), elect to keep its capabilities when it * calls setuid() and switches away from uid==0. Both permitted and * effective sets will be retained. * Without this change, it was impossible for a daemon to drop only some * of its privilege. The call to setuid(!=0) would drop all privileges! * Keeping uid 0 is not an option because uid 0 owns too many vital * files.. * Thanks to Olaf Kirch and Peter Benie for spotting this. */ static inline void cap_emulate_setxuid(struct cred *new, const struct cred *old) { kuid_t root_uid = make_kuid(old->user_ns, 0); if ((uid_eq(old->uid, root_uid) || uid_eq(old->euid, root_uid) || uid_eq(old->suid, root_uid)) && (!uid_eq(new->uid, root_uid) && !uid_eq(new->euid, root_uid) && !uid_eq(new->suid, root_uid))) { if (!issecure(SECURE_KEEP_CAPS)) { cap_clear(new->cap_permitted); cap_clear(new->cap_effective); } /* * Pre-ambient programs expect setresuid to nonroot followed * by exec to drop capabilities. We should make sure that * this remains the case. */ cap_clear(new->cap_ambient); } if (uid_eq(old->euid, root_uid) && !uid_eq(new->euid, root_uid)) cap_clear(new->cap_effective); if (!uid_eq(old->euid, root_uid) && uid_eq(new->euid, root_uid)) new->cap_effective = new->cap_permitted; } /** * cap_task_fix_setuid - Fix up the results of setuid() call * @new: The proposed credentials * @old: The current task's current credentials * @flags: Indications of what has changed * * Fix up the results of setuid() call before the credential changes are * actually applied. * * Return: 0 to grant the changes, -ve to deny them. */ int cap_task_fix_setuid(struct cred *new, const struct cred *old, int flags) { switch (flags) { case LSM_SETID_RE: case LSM_SETID_ID: case LSM_SETID_RES: /* juggle the capabilities to follow [RES]UID changes unless * otherwise suppressed */ if (!issecure(SECURE_NO_SETUID_FIXUP)) cap_emulate_setxuid(new, old); break; case LSM_SETID_FS: /* juggle the capabilties to follow FSUID changes, unless * otherwise suppressed * * FIXME - is fsuser used for all CAP_FS_MASK capabilities? * if not, we might be a bit too harsh here. */ if (!issecure(SECURE_NO_SETUID_FIXUP)) { kuid_t root_uid = make_kuid(old->user_ns, 0); if (uid_eq(old->fsuid, root_uid) && !uid_eq(new->fsuid, root_uid)) new->cap_effective = cap_drop_fs_set(new->cap_effective); if (!uid_eq(old->fsuid, root_uid) && uid_eq(new->fsuid, root_uid)) new->cap_effective = cap_raise_fs_set(new->cap_effective, new->cap_permitted); } break; default: return -EINVAL; } return 0; } /* * Rationale: code calling task_setscheduler, task_setioprio, and * task_setnice, assumes that * . if capable(cap_sys_nice), then those actions should be allowed * . if not capable(cap_sys_nice), but acting on your own processes, * then those actions should be allowed * This is insufficient now since you can call code without suid, but * yet with increased caps. * So we check for increased caps on the target process. */ static int cap_safe_nice(struct task_struct *p) { int is_subset, ret = 0; rcu_read_lock(); is_subset = cap_issubset(__task_cred(p)->cap_permitted, current_cred()->cap_permitted); if (!is_subset && !ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) ret = -EPERM; rcu_read_unlock(); return ret; } /** * cap_task_setscheduler - Detemine if scheduler policy change is permitted * @p: The task to affect * * Detemine if the requested scheduler policy change is permitted for the * specified task. * * Return: 0 if permission is granted, -ve if denied. */ int cap_task_setscheduler(struct task_struct *p) { return cap_safe_nice(p); } /** * cap_task_setioprio - Detemine if I/O priority change is permitted * @p: The task to affect * @ioprio: The I/O priority to set * * Detemine if the requested I/O priority change is permitted for the specified * task. * * Return: 0 if permission is granted, -ve if denied. */ int cap_task_setioprio(struct task_struct *p, int ioprio) { return cap_safe_nice(p); } /** * cap_task_setnice - Detemine if task priority change is permitted * @p: The task to affect * @nice: The nice value to set * * Detemine if the requested task priority change is permitted for the * specified task. * * Return: 0 if permission is granted, -ve if denied. */ int cap_task_setnice(struct task_struct *p, int nice) { return cap_safe_nice(p); } /* * Implement PR_CAPBSET_DROP. Attempt to remove the specified capability from * the current task's bounding set. Returns 0 on success, -ve on error. */ static int cap_prctl_drop(unsigned long cap) { struct cred *new; if (!ns_capable(current_user_ns(), CAP_SETPCAP)) return -EPERM; if (!cap_valid(cap)) return -EINVAL; new = prepare_creds(); if (!new) return -ENOMEM; cap_lower(new->cap_bset, cap); return commit_creds(new); } /** * cap_task_prctl - Implement process control functions for this security module * @option: The process control function requested * @arg2: The argument data for this function * @arg3: The argument data for this function * @arg4: The argument data for this function * @arg5: The argument data for this function * * Allow process control functions (sys_prctl()) to alter capabilities; may * also deny access to other functions not otherwise implemented here. * * Return: 0 or +ve on success, -ENOSYS if this function is not implemented * here, other -ve on error. If -ENOSYS is returned, sys_prctl() and other LSM * modules will consider performing the function. */ int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) { const struct cred *old = current_cred(); struct cred *new; switch (option) { case PR_CAPBSET_READ: if (!cap_valid(arg2)) return -EINVAL; return !!cap_raised(old->cap_bset, arg2); case PR_CAPBSET_DROP: return cap_prctl_drop(arg2); /* * The next four prctl's remain to assist with transitioning a * system from legacy UID=0 based privilege (when filesystem * capabilities are not in use) to a system using filesystem * capabilities only - as the POSIX.1e draft intended. * * Note: * * PR_SET_SECUREBITS = * issecure_mask(SECURE_KEEP_CAPS_LOCKED) * | issecure_mask(SECURE_NOROOT) * | issecure_mask(SECURE_NOROOT_LOCKED) * | issecure_mask(SECURE_NO_SETUID_FIXUP) * | issecure_mask(SECURE_NO_SETUID_FIXUP_LOCKED) * * will ensure that the current process and all of its * children will be locked into a pure * capability-based-privilege environment. */ case PR_SET_SECUREBITS: if ((((old->securebits & SECURE_ALL_LOCKS) >> 1) & (old->securebits ^ arg2)) /*[1]*/ || ((old->securebits & SECURE_ALL_LOCKS & ~arg2)) /*[2]*/ || (arg2 & ~(SECURE_ALL_LOCKS | SECURE_ALL_BITS)) /*[3]*/ || (cap_capable(current_cred(), current_cred()->user_ns, CAP_SETPCAP, CAP_OPT_NONE) != 0) /*[4]*/ /* * [1] no changing of bits that are locked * [2] no unlocking of locks * [3] no setting of unsupported bits * [4] doing anything requires privilege (go read about * the "sendmail capabilities bug") */ ) /* cannot change a locked bit */ return -EPERM; new = prepare_creds(); if (!new) return -ENOMEM; new->securebits = arg2; return commit_creds(new); case PR_GET_SECUREBITS: return old->securebits; case PR_GET_KEEPCAPS: return !!issecure(SECURE_KEEP_CAPS); case PR_SET_KEEPCAPS: if (arg2 > 1) /* Note, we rely on arg2 being unsigned here */ return -EINVAL; if (issecure(SECURE_KEEP_CAPS_LOCKED)) return -EPERM; new = prepare_creds(); if (!new) return -ENOMEM; if (arg2) new->securebits |= issecure_mask(SECURE_KEEP_CAPS); else new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS); return commit_creds(new); case PR_CAP_AMBIENT: if (arg2 == PR_CAP_AMBIENT_CLEAR_ALL) { if (arg3 | arg4 | arg5) return -EINVAL; new = prepare_creds(); if (!new) return -ENOMEM; cap_clear(new->cap_ambient); return commit_creds(new); } if (((!cap_valid(arg3)) | arg4 | arg5)) return -EINVAL; if (arg2 == PR_CAP_AMBIENT_IS_SET) { return !!cap_raised(current_cred()->cap_ambient, arg3); } else if (arg2 != PR_CAP_AMBIENT_RAISE && arg2 != PR_CAP_AMBIENT_LOWER) { return -EINVAL; } else { if (arg2 == PR_CAP_AMBIENT_RAISE && (!cap_raised(current_cred()->cap_permitted, arg3) || !cap_raised(current_cred()->cap_inheritable, arg3) || issecure(SECURE_NO_CAP_AMBIENT_RAISE))) return -EPERM; new = prepare_creds(); if (!new) return -ENOMEM; if (arg2 == PR_CAP_AMBIENT_RAISE) cap_raise(new->cap_ambient, arg3); else cap_lower(new->cap_ambient, arg3); return commit_creds(new); } default: /* No functionality available - continue with default */ return -ENOSYS; } } /** * cap_vm_enough_memory - Determine whether a new virtual mapping is permitted * @mm: The VM space in which the new mapping is to be made * @pages: The size of the mapping * * Determine whether the allocation of a new virtual mapping by the current * task is permitted. * * Return: 1 if permission is granted, 0 if not. */ int cap_vm_enough_memory(struct mm_struct *mm, long pages) { int cap_sys_admin = 0; if (cap_capable(current_cred(), &init_user_ns, CAP_SYS_ADMIN, CAP_OPT_NOAUDIT) == 0) cap_sys_admin = 1; return cap_sys_admin; } /** * cap_mmap_addr - check if able to map given addr * @addr: address attempting to be mapped * * If the process is attempting to map memory below dac_mmap_min_addr they need * CAP_SYS_RAWIO. The other parameters to this function are unused by the * capability security module. * * Return: 0 if this mapping should be allowed or -EPERM if not. */ int cap_mmap_addr(unsigned long addr) { int ret = 0; if (addr < dac_mmap_min_addr) { ret = cap_capable(current_cred(), &init_user_ns, CAP_SYS_RAWIO, CAP_OPT_NONE); /* set PF_SUPERPRIV if it turns out we allow the low mmap */ if (ret == 0) current->flags |= PF_SUPERPRIV; } return ret; } int cap_mmap_file(struct file *file, unsigned long reqprot, unsigned long prot, unsigned long flags) { return 0; } #ifdef CONFIG_SECURITY static struct security_hook_list capability_hooks[] __lsm_ro_after_init = { LSM_HOOK_INIT(capable, cap_capable), LSM_HOOK_INIT(settime, cap_settime), LSM_HOOK_INIT(ptrace_access_check, cap_ptrace_access_check), LSM_HOOK_INIT(ptrace_traceme, cap_ptrace_traceme), LSM_HOOK_INIT(capget, cap_capget), LSM_HOOK_INIT(capset, cap_capset), LSM_HOOK_INIT(bprm_creds_from_file, cap_bprm_creds_from_file), LSM_HOOK_INIT(inode_need_killpriv, cap_inode_need_killpriv), LSM_HOOK_INIT(inode_killpriv, cap_inode_killpriv), LSM_HOOK_INIT(inode_getsecurity, cap_inode_getsecurity), LSM_HOOK_INIT(mmap_addr, cap_mmap_addr), LSM_HOOK_INIT(mmap_file, cap_mmap_file), LSM_HOOK_INIT(task_fix_setuid, cap_task_fix_setuid), LSM_HOOK_INIT(task_prctl, cap_task_prctl), LSM_HOOK_INIT(task_setscheduler, cap_task_setscheduler), LSM_HOOK_INIT(task_setioprio, cap_task_setioprio), LSM_HOOK_INIT(task_setnice, cap_task_setnice), LSM_HOOK_INIT(vm_enough_memory, cap_vm_enough_memory), }; static int __init capability_init(void) { security_add_hooks(capability_hooks, ARRAY_SIZE(capability_hooks), "capability"); return 0; } DEFINE_LSM(capability) = { .name = "capability", .order = LSM_ORDER_FIRST, .init = capability_init, }; #endif /* CONFIG_SECURITY */ |
2072 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 | /* SPDX-License-Identifier: GPL-2.0 */ /* * x86 KFENCE support. * * Copyright (C) 2020, Google LLC. */ #ifndef _ASM_X86_KFENCE_H #define _ASM_X86_KFENCE_H #ifndef MODULE #include <linux/bug.h> #include <linux/kfence.h> #include <asm/pgalloc.h> #include <asm/pgtable.h> #include <asm/set_memory.h> #include <asm/tlbflush.h> /* Force 4K pages for __kfence_pool. */ static inline bool arch_kfence_init_pool(void) { unsigned long addr; for (addr = (unsigned long)__kfence_pool; is_kfence_address((void *)addr); addr += PAGE_SIZE) { unsigned int level; if (!lookup_address(addr, &level)) return false; if (level != PG_LEVEL_4K) set_memory_4k(addr, 1); } return true; } /* Protect the given page and flush TLB. */ static inline bool kfence_protect_page(unsigned long addr, bool protect) { unsigned int level; pte_t *pte = lookup_address(addr, &level); if (WARN_ON(!pte || level != PG_LEVEL_4K)) return false; /* * We need to avoid IPIs, as we may get KFENCE allocations or faults * with interrupts disabled. Therefore, the below is best-effort, and * does not flush TLBs on all CPUs. We can tolerate some inaccuracy; * lazy fault handling takes care of faults after the page is PRESENT. */ if (protect) set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT)); else set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT)); /* * Flush this CPU's TLB, assuming whoever did the allocation/free is * likely to continue running on this CPU. */ preempt_disable(); flush_tlb_one_kernel(addr); preempt_enable(); return true; } #endif /* !MODULE */ #endif /* _ASM_X86_KFENCE_H */ |
1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 | /* * linux/fs/nls/mac-greek.c * * Charset macgreek translation tables. * Generated automatically from the Unicode and charset * tables from the Unicode Organization (www.unicode.org). * The Unicode to charset table has only exact mappings. */ /* * COPYRIGHT AND PERMISSION NOTICE * * Copyright 1991-2012 Unicode, Inc. All rights reserved. Distributed under * the Terms of Use in http://www.unicode.org/copyright.html. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of the Unicode data files and any associated documentation (the "Data * Files") or Unicode software and any associated documentation (the * "Software") to deal in the Data Files or Software without restriction, * including without limitation the rights to use, copy, modify, merge, * publish, distribute, and/or sell copies of the Data Files or Software, and * to permit persons to whom the Data Files or Software are furnished to do * so, provided that (a) the above copyright notice(s) and this permission * notice appear with all copies of the Data Files or Software, (b) both the * above copyright notice(s) and this permission notice appear in associated * documentation, and (c) there is clear notice in each modified Data File or * in the Software as well as in the documentation associated with the Data * File(s) or Software that the data or software has been modified. * * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY * KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF * THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS * INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR * PERFORMANCE OF THE DATA FILES OR SOFTWARE. * * Except as contained in this notice, the name of a copyright holder shall * not be used in advertising or otherwise to promote the sale, use or other * dealings in these Data Files or Software without prior written * authorization of the copyright holder. */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/nls.h> #include <linux/errno.h> static const wchar_t charset2uni[256] = { /* 0x00 */ 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, /* 0x10 */ 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, /* 0x20 */ 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, /* 0x30 */ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, /* 0x40 */ 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, /* 0x50 */ 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, /* 0x60 */ 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, /* 0x70 */ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f, /* 0x80 */ 0x00c4, 0x00b9, 0x00b2, 0x00c9, 0x00b3, 0x00d6, 0x00dc, 0x0385, 0x00e0, 0x00e2, 0x00e4, 0x0384, 0x00a8, 0x00e7, 0x00e9, 0x00e8, /* 0x90 */ 0x00ea, 0x00eb, 0x00a3, 0x2122, 0x00ee, 0x00ef, 0x2022, 0x00bd, 0x2030, 0x00f4, 0x00f6, 0x00a6, 0x20ac, 0x00f9, 0x00fb, 0x00fc, /* 0xa0 */ 0x2020, 0x0393, 0x0394, 0x0398, 0x039b, 0x039e, 0x03a0, 0x00df, 0x00ae, 0x00a9, 0x03a3, 0x03aa, 0x00a7, 0x2260, 0x00b0, 0x00b7, /* 0xb0 */ 0x0391, 0x00b1, 0x2264, 0x2265, 0x00a5, 0x0392, 0x0395, 0x0396, 0x0397, 0x0399, 0x039a, 0x039c, 0x03a6, 0x03ab, 0x03a8, 0x03a9, /* 0xc0 */ 0x03ac, 0x039d, 0x00ac, 0x039f, 0x03a1, 0x2248, 0x03a4, 0x00ab, 0x00bb, 0x2026, 0x00a0, 0x03a5, 0x03a7, 0x0386, 0x0388, 0x0153, /* 0xd0 */ 0x2013, 0x2015, 0x201c, 0x201d, 0x2018, 0x2019, 0x00f7, 0x0389, 0x038a, 0x038c, 0x038e, 0x03ad, 0x03ae, 0x03af, 0x03cc, 0x038f, /* 0xe0 */ 0x03cd, 0x03b1, 0x03b2, 0x03c8, 0x03b4, 0x03b5, 0x03c6, 0x03b3, 0x03b7, 0x03b9, 0x03be, 0x03ba, 0x03bb, 0x03bc, 0x03bd, 0x03bf, /* 0xf0 */ 0x03c0, 0x03ce, 0x03c1, 0x03c3, 0x03c4, 0x03b8, 0x03c9, 0x03c2, 0x03c7, 0x03c5, 0x03b6, 0x03ca, 0x03cb, 0x0390, 0x03b0, 0x00ad, }; static const unsigned char page00[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0xca, 0x00, 0x00, 0x92, 0x00, 0xb4, 0x9b, 0xac, /* 0xa0-0xa7 */ 0x8c, 0xa9, 0x00, 0xc7, 0xc2, 0xff, 0xa8, 0x00, /* 0xa8-0xaf */ 0xae, 0xb1, 0x82, 0x84, 0x00, 0x00, 0x00, 0xaf, /* 0xb0-0xb7 */ 0x00, 0x81, 0x00, 0xc8, 0x00, 0x97, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x83, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x85, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x86, 0x00, 0x00, 0xa7, /* 0xd8-0xdf */ 0x88, 0x00, 0x89, 0x00, 0x8a, 0x00, 0x00, 0x8d, /* 0xe0-0xe7 */ 0x8f, 0x8e, 0x90, 0x91, 0x00, 0x00, 0x94, 0x95, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x99, 0x00, 0x9a, 0xd6, /* 0xf0-0xf7 */ 0x00, 0x9d, 0x00, 0x9e, 0x9f, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char page01[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0xcf, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char page03[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x8b, 0x87, 0xcd, 0x00, /* 0x80-0x87 */ 0xce, 0xd7, 0xd8, 0x00, 0xd9, 0x00, 0xda, 0xdf, /* 0x88-0x8f */ 0xfd, 0xb0, 0xb5, 0xa1, 0xa2, 0xb6, 0xb7, 0xb8, /* 0x90-0x97 */ 0xa3, 0xb9, 0xba, 0xa4, 0xbb, 0xc1, 0xa5, 0xc3, /* 0x98-0x9f */ 0xa6, 0xc4, 0x00, 0xaa, 0xc6, 0xcb, 0xbc, 0xcc, /* 0xa0-0xa7 */ 0xbe, 0xbf, 0xab, 0xbd, 0xc0, 0xdb, 0xdc, 0xdd, /* 0xa8-0xaf */ 0xfe, 0xe1, 0xe2, 0xe7, 0xe4, 0xe5, 0xfa, 0xe8, /* 0xb0-0xb7 */ 0xf5, 0xe9, 0xeb, 0xec, 0xed, 0xee, 0xea, 0xef, /* 0xb8-0xbf */ 0xf0, 0xf2, 0xf7, 0xf3, 0xf4, 0xf9, 0xe6, 0xf8, /* 0xc0-0xc7 */ 0xe3, 0xf6, 0xfb, 0xfc, 0xde, 0xe0, 0xf1, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char page20[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0xd0, 0x00, 0xd1, 0x00, 0x00, /* 0x10-0x17 */ 0xd4, 0xd5, 0x00, 0x00, 0xd2, 0xd3, 0x00, 0x00, /* 0x18-0x1f */ 0xa0, 0x00, 0x96, 0x00, 0x00, 0x00, 0xc9, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x98, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x9c, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char page21[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x93, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char page22[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0xc5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0xad, 0x00, 0x00, 0x00, 0xb2, 0xb3, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char *const page_uni2charset[256] = { page00, page01, NULL, page03, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, page20, page21, page22, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, }; static const unsigned char charset2lower[256] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x00-0x07 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x08-0x0f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x10-0x17 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x18-0x1f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x20-0x27 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x28-0x2f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x30-0x37 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x38-0x3f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x40-0x47 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x48-0x4f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x50-0x57 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x58-0x5f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x60-0x67 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x68-0x6f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x70-0x77 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x78-0x7f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x80-0x87 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x88-0x8f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x90-0x97 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x98-0x9f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa0-0xa7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa8-0xaf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb0-0xb7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb8-0xbf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc0-0xc7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc8-0xcf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd0-0xd7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd8-0xdf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe0-0xe7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe8-0xef */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf0-0xf7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf8-0xff */ }; static const unsigned char charset2upper[256] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x00-0x07 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x08-0x0f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x10-0x17 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x18-0x1f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x20-0x27 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x28-0x2f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x30-0x37 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x38-0x3f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x40-0x47 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x48-0x4f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x50-0x57 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x58-0x5f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x60-0x67 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x68-0x6f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x70-0x77 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x78-0x7f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x80-0x87 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x88-0x8f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x90-0x97 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x98-0x9f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa0-0xa7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa8-0xaf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb0-0xb7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb8-0xbf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc0-0xc7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc8-0xcf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd0-0xd7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd8-0xdf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe0-0xe7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe8-0xef */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf0-0xf7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf8-0xff */ }; static int uni2char(wchar_t uni, unsigned char *out, int boundlen) { const unsigned char *uni2charset; unsigned char cl = uni & 0x00ff; unsigned char ch = (uni & 0xff00) >> 8; if (boundlen <= 0) return -ENAMETOOLONG; uni2charset = page_uni2charset[ch]; if (uni2charset && uni2charset[cl]) out[0] = uni2charset[cl]; else return -EINVAL; return 1; } static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) { *uni = charset2uni[*rawstring]; if (*uni == 0x0000) return -EINVAL; return 1; } static struct nls_table table = { .charset = "macgreek", .uni2char = uni2char, .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, }; static int __init init_nls_macgreek(void) { return register_nls(&table); } static void __exit exit_nls_macgreek(void) { unregister_nls(&table); } module_init(init_nls_macgreek) module_exit(exit_nls_macgreek) MODULE_LICENSE("Dual BSD/GPL"); |
1 465 39 8 3 7 31 37 465 465 521 300 521 520 287 79 287 236 168 287 287 79 79 31 79 79 79 79 287 22 22 287 287 260 244 207 244 242 244 287 287 287 287 8 20 20 287 287 261 287 219 189 287 11 188 188 287 20 22 22 288 288 1 26 287 261 223 1 79 79 79 79 68 122 523 523 523 187 187 187 187 139 59 79 79 79 79 162 138 137 137 138 88 88 82 82 88 79 79 79 79 79 79 79 481 29 29 29 31 1 30 30 30 31 31 478 486 48 478 82 40 13 31 40 10 8 5 5 5 2 2 60 60 60 60 474 474 28 469 474 474 113 370 370 28 28 465 465 465 6926 6922 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 | // SPDX-License-Identifier: GPL-2.0-only /* * linux/mm/swapfile.c * * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds * Swap reorganised 29.12.95, Stephen Tweedie */ #include <linux/blkdev.h> #include <linux/mm.h> #include <linux/sched/mm.h> #include <linux/sched/task.h> #include <linux/hugetlb.h> #include <linux/mman.h> #include <linux/slab.h> #include <linux/kernel_stat.h> #include <linux/swap.h> #include <linux/vmalloc.h> #include <linux/pagemap.h> #include <linux/namei.h> #include <linux/shmem_fs.h> #include <linux/blk-cgroup.h> #include <linux/random.h> #include <linux/writeback.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/init.h> #include <linux/ksm.h> #include <linux/rmap.h> #include <linux/security.h> #include <linux/backing-dev.h> #include <linux/mutex.h> #include <linux/capability.h> #include <linux/syscalls.h> #include <linux/memcontrol.h> #include <linux/poll.h> #include <linux/oom.h> #include <linux/frontswap.h> #include <linux/swapfile.h> #include <linux/export.h> #include <linux/swap_slots.h> #include <linux/sort.h> #include <linux/completion.h> #include <asm/tlbflush.h> #include <linux/swapops.h> #include <linux/swap_cgroup.h> #include "swap.h" #include <trace/hooks/bl_hib.h> static bool swap_count_continued(struct swap_info_struct *, pgoff_t, unsigned char); static void free_swap_count_continuations(struct swap_info_struct *); static DEFINE_SPINLOCK(swap_lock); static unsigned int nr_swapfiles; atomic_long_t nr_swap_pages; /* * Some modules use swappable objects and may try to swap them out under * memory pressure (via the shrinker). Before doing so, they may wish to * check to see if any swap space is available. */ EXPORT_SYMBOL_GPL(nr_swap_pages); /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ long total_swap_pages; static int least_priority = -1; unsigned long swapfile_maximum_size; #ifdef CONFIG_MIGRATION bool swap_migration_ad_supported; #endif /* CONFIG_MIGRATION */ static const char Bad_file[] = "Bad swap file entry "; static const char Unused_file[] = "Unused swap file entry "; static const char Bad_offset[] = "Bad swap offset entry "; static const char Unused_offset[] = "Unused swap offset entry "; /* * all active swap_info_structs * protected with swap_lock, and ordered by priority. */ static PLIST_HEAD(swap_active_head); /* * all available (active, not full) swap_info_structs * protected with swap_avail_lock, ordered by priority. * This is used by folio_alloc_swap() instead of swap_active_head * because swap_active_head includes all swap_info_structs, * but folio_alloc_swap() doesn't need to look at full ones. * This uses its own lock instead of swap_lock because when a * swap_info_struct changes between not-full/full, it needs to * add/remove itself to/from this list, but the swap_info_struct->lock * is held and the locking order requires swap_lock to be taken * before any swap_info_struct->lock. */ static struct plist_head *swap_avail_heads; static DEFINE_SPINLOCK(swap_avail_lock); struct swap_info_struct *swap_info[MAX_SWAPFILES]; static DEFINE_MUTEX(swapon_mutex); static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait); /* Activity counter to indicate that a swapon or swapoff has occurred */ static atomic_t proc_poll_event = ATOMIC_INIT(0); atomic_t nr_rotate_swap = ATOMIC_INIT(0); static struct swap_info_struct *swap_type_to_swap_info(int type) { if (type >= MAX_SWAPFILES) return NULL; return READ_ONCE(swap_info[type]); /* rcu_dereference() */ } static inline unsigned char swap_count(unsigned char ent) { return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */ } /* Reclaim the swap entry anyway if possible */ #define TTRS_ANYWAY 0x1 /* * Reclaim the swap entry if there are no more mappings of the * corresponding page */ #define TTRS_UNMAPPED 0x2 /* Reclaim the swap entry if swap is getting full*/ #define TTRS_FULL 0x4 /* returns 1 if swap entry is freed */ static int __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset, unsigned long flags) { swp_entry_t entry = swp_entry(si->type, offset); struct folio *folio; int ret = 0; folio = filemap_get_folio(swap_address_space(entry), offset); if (!folio) return 0; /* * When this function is called from scan_swap_map_slots() and it's * called by vmscan.c at reclaiming folios. So we hold a folio lock * here. We have to use trylock for avoiding deadlock. This is a special * case and you should use folio_free_swap() with explicit folio_lock() * in usual operations. */ if (folio_trylock(folio)) { if ((flags & TTRS_ANYWAY) || ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) || ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio))) ret = folio_free_swap(folio); folio_unlock(folio); } folio_put(folio); return ret; } static inline struct swap_extent *first_se(struct swap_info_struct *sis) { struct rb_node *rb = rb_first(&sis->swap_extent_root); return rb_entry(rb, struct swap_extent, rb_node); } static inline struct swap_extent *next_se(struct swap_extent *se) { struct rb_node *rb = rb_next(&se->rb_node); return rb ? rb_entry(rb, struct swap_extent, rb_node) : NULL; } /* * swapon tell device that all the old swap contents can be discarded, * to allow the swap device to optimize its wear-levelling. */ static int discard_swap(struct swap_info_struct *si) { struct swap_extent *se; sector_t start_block; sector_t nr_blocks; int err = 0; /* Do not discard the swap header page! */ se = first_se(si); start_block = (se->start_block + 1) << (PAGE_SHIFT - 9); nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9); if (nr_blocks) { err = blkdev_issue_discard(si->bdev, start_block, nr_blocks, GFP_KERNEL); if (err) return err; cond_resched(); } for (se = next_se(se); se; se = next_se(se)) { start_block = se->start_block << (PAGE_SHIFT - 9); nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); err = blkdev_issue_discard(si->bdev, start_block, nr_blocks, GFP_KERNEL); if (err) break; cond_resched(); } return err; /* That will often be -EOPNOTSUPP */ } static struct swap_extent * offset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset) { struct swap_extent *se; struct rb_node *rb; rb = sis->swap_extent_root.rb_node; while (rb) { se = rb_entry(rb, struct swap_extent, rb_node); if (offset < se->start_page) rb = rb->rb_left; else if (offset >= se->start_page + se->nr_pages) rb = rb->rb_right; else return se; } /* It *must* be present */ BUG(); } sector_t swap_page_sector(struct page *page) { struct swap_info_struct *sis = page_swap_info(page); struct swap_extent *se; sector_t sector; pgoff_t offset; offset = __page_file_index(page); se = offset_to_swap_extent(sis, offset); sector = se->start_block + (offset - se->start_page); return sector << (PAGE_SHIFT - 9); } /* * swap allocation tell device that a cluster of swap can now be discarded, * to allow the swap device to optimize its wear-levelling. */ static void discard_swap_cluster(struct swap_info_struct *si, pgoff_t start_page, pgoff_t nr_pages) { struct swap_extent *se = offset_to_swap_extent(si, start_page); while (nr_pages) { pgoff_t offset = start_page - se->start_page; sector_t start_block = se->start_block + offset; sector_t nr_blocks = se->nr_pages - offset; if (nr_blocks > nr_pages) nr_blocks = nr_pages; start_page += nr_blocks; nr_pages -= nr_blocks; start_block <<= PAGE_SHIFT - 9; nr_blocks <<= PAGE_SHIFT - 9; if (blkdev_issue_discard(si->bdev, start_block, nr_blocks, GFP_NOIO)) break; se = next_se(se); } } #ifdef CONFIG_THP_SWAP #define SWAPFILE_CLUSTER HPAGE_PMD_NR #define swap_entry_size(size) (size) #else #define SWAPFILE_CLUSTER 256 /* * Define swap_entry_size() as constant to let compiler to optimize * out some code if !CONFIG_THP_SWAP */ #define swap_entry_size(size) 1 #endif #define LATENCY_LIMIT 256 static inline void cluster_set_flag(struct swap_cluster_info *info, unsigned int flag) { info->flags = flag; } static inline unsigned int cluster_count(struct swap_cluster_info *info) { return info->data; } static inline void cluster_set_count(struct swap_cluster_info *info, unsigned int c) { info->data = c; } static inline void cluster_set_count_flag(struct swap_cluster_info *info, unsigned int c, unsigned int f) { info->flags = f; info->data = c; } static inline unsigned int cluster_next(struct swap_cluster_info *info) { return info->data; } static inline void cluster_set_next(struct swap_cluster_info *info, unsigned int n) { info->data = n; } static inline void cluster_set_next_flag(struct swap_cluster_info *info, unsigned int n, unsigned int f) { info->flags = f; info->data = n; } static inline bool cluster_is_free(struct swap_cluster_info *info) { return info->flags & CLUSTER_FLAG_FREE; } static inline bool cluster_is_null(struct swap_cluster_info *info) { return info->flags & CLUSTER_FLAG_NEXT_NULL; } static inline void cluster_set_null(struct swap_cluster_info *info) { info->flags = CLUSTER_FLAG_NEXT_NULL; info->data = 0; } static inline bool cluster_is_huge(struct swap_cluster_info *info) { if (IS_ENABLED(CONFIG_THP_SWAP)) return info->flags & CLUSTER_FLAG_HUGE; return false; } static inline void cluster_clear_huge(struct swap_cluster_info *info) { info->flags &= ~CLUSTER_FLAG_HUGE; } static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si, unsigned long offset) { struct swap_cluster_info *ci; ci = si->cluster_info; if (ci) { ci += offset / SWAPFILE_CLUSTER; spin_lock(&ci->lock); } return ci; } static inline void unlock_cluster(struct swap_cluster_info *ci) { if (ci) spin_unlock(&ci->lock); } /* * Determine the locking method in use for this device. Return * swap_cluster_info if SSD-style cluster-based locking is in place. */ static inline struct swap_cluster_info *lock_cluster_or_swap_info( struct swap_info_struct *si, unsigned long offset) { struct swap_cluster_info *ci; /* Try to use fine-grained SSD-style locking if available: */ ci = lock_cluster(si, offset); /* Otherwise, fall back to traditional, coarse locking: */ if (!ci) spin_lock(&si->lock); return ci; } static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si, struct swap_cluster_info *ci) { if (ci) unlock_cluster(ci); else spin_unlock(&si->lock); } static inline bool cluster_list_empty(struct swap_cluster_list *list) { return cluster_is_null(&list->head); } static inline unsigned int cluster_list_first(struct swap_cluster_list *list) { return cluster_next(&list->head); } static void cluster_list_init(struct swap_cluster_list *list) { cluster_set_null(&list->head); cluster_set_null(&list->tail); } static void cluster_list_add_tail(struct swap_cluster_list *list, struct swap_cluster_info *ci, unsigned int idx) { if (cluster_list_empty(list)) { cluster_set_next_flag(&list->head, idx, 0); cluster_set_next_flag(&list->tail, idx, 0); } else { struct swap_cluster_info *ci_tail; unsigned int tail = cluster_next(&list->tail); /* * Nested cluster lock, but both cluster locks are * only acquired when we held swap_info_struct->lock */ ci_tail = ci + tail; spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING); cluster_set_next(ci_tail, idx); spin_unlock(&ci_tail->lock); cluster_set_next_flag(&list->tail, idx, 0); } } static unsigned int cluster_list_del_first(struct swap_cluster_list *list, struct swap_cluster_info *ci) { unsigned int idx; idx = cluster_next(&list->head); if (cluster_next(&list->tail) == idx) { cluster_set_null(&list->head); cluster_set_null(&list->tail); } else cluster_set_next_flag(&list->head, cluster_next(&ci[idx]), 0); return idx; } /* Add a cluster to discard list and schedule it to do discard */ static void swap_cluster_schedule_discard(struct swap_info_struct *si, unsigned int idx) { /* * If scan_swap_map_slots() can't find a free cluster, it will check * si->swap_map directly. To make sure the discarding cluster isn't * taken by scan_swap_map_slots(), mark the swap entries bad (occupied). * It will be cleared after discard */ memset(si->swap_map + idx * SWAPFILE_CLUSTER, SWAP_MAP_BAD, SWAPFILE_CLUSTER); cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx); schedule_work(&si->discard_work); } static void __free_cluster(struct swap_info_struct *si, unsigned long idx) { struct swap_cluster_info *ci = si->cluster_info; cluster_set_flag(ci + idx, CLUSTER_FLAG_FREE); cluster_list_add_tail(&si->free_clusters, ci, idx); } /* * Doing discard actually. After a cluster discard is finished, the cluster * will be added to free cluster list. caller should hold si->lock. */ static void swap_do_scheduled_discard(struct swap_info_struct *si) { struct swap_cluster_info *info, *ci; unsigned int idx; info = si->cluster_info; while (!cluster_list_empty(&si->discard_clusters)) { idx = cluster_list_del_first(&si->discard_clusters, info); spin_unlock(&si->lock); discard_swap_cluster(si, idx * SWAPFILE_CLUSTER, SWAPFILE_CLUSTER); spin_lock(&si->lock); ci = lock_cluster(si, idx * SWAPFILE_CLUSTER); __free_cluster(si, idx); memset(si->swap_map + idx * SWAPFILE_CLUSTER, 0, SWAPFILE_CLUSTER); unlock_cluster(ci); } } static void swap_discard_work(struct work_struct *work) { struct swap_info_struct *si; si = container_of(work, struct swap_info_struct, discard_work); spin_lock(&si->lock); swap_do_scheduled_discard(si); spin_unlock(&si->lock); } static void swap_users_ref_free(struct percpu_ref *ref) { struct swap_info_struct *si; si = container_of(ref, struct swap_info_struct, users); complete(&si->comp); } static void alloc_cluster(struct swap_info_struct *si, unsigned long idx) { struct swap_cluster_info *ci = si->cluster_info; VM_BUG_ON(cluster_list_first(&si->free_clusters) != idx); cluster_list_del_first(&si->free_clusters, ci); cluster_set_count_flag(ci + idx, 0, 0); } static void free_cluster(struct swap_info_struct *si, unsigned long idx) { struct swap_cluster_info *ci = si->cluster_info + idx; VM_BUG_ON(cluster_count(ci) != 0); /* * If the swap is discardable, prepare discard the cluster * instead of free it immediately. The cluster will be freed * after discard. */ if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) == (SWP_WRITEOK | SWP_PAGE_DISCARD)) { swap_cluster_schedule_discard(si, idx); return; } __free_cluster(si, idx); } /* * The cluster corresponding to page_nr will be used. The cluster will be * removed from free cluster list and its usage counter will be increased. */ static void inc_cluster_info_page(struct swap_info_struct *p, struct swap_cluster_info *cluster_info, unsigned long page_nr) { unsigned long idx = page_nr / SWAPFILE_CLUSTER; if (!cluster_info) return; if (cluster_is_free(&cluster_info[idx])) alloc_cluster(p, idx); VM_BUG_ON(cluster_count(&cluster_info[idx]) >= SWAPFILE_CLUSTER); cluster_set_count(&cluster_info[idx], cluster_count(&cluster_info[idx]) + 1); } /* * The cluster corresponding to page_nr decreases one usage. If the usage * counter becomes 0, which means no page in the cluster is in using, we can * optionally discard the cluster and add it to free cluster list. */ static void dec_cluster_info_page(struct swap_info_struct *p, struct swap_cluster_info *cluster_info, unsigned long page_nr) { unsigned long idx = page_nr / SWAPFILE_CLUSTER; if (!cluster_info) return; VM_BUG_ON(cluster_count(&cluster_info[idx]) == 0); cluster_set_count(&cluster_info[idx], cluster_count(&cluster_info[idx]) - 1); if (cluster_count(&cluster_info[idx]) == 0) free_cluster(p, idx); } /* * It's possible scan_swap_map_slots() uses a free cluster in the middle of free * cluster list. Avoiding such abuse to avoid list corruption. */ static bool scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si, unsigned long offset) { struct percpu_cluster *percpu_cluster; bool conflict; offset /= SWAPFILE_CLUSTER; conflict = !cluster_list_empty(&si->free_clusters) && offset != cluster_list_first(&si->free_clusters) && cluster_is_free(&si->cluster_info[offset]); if (!conflict) return false; percpu_cluster = this_cpu_ptr(si->percpu_cluster); cluster_set_null(&percpu_cluster->index); return true; } /* * Try to get a swap entry from current cpu's swap entry pool (a cluster). This * might involve allocating a new cluster for current CPU too. */ static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si, unsigned long *offset, unsigned long *scan_base) { struct percpu_cluster *cluster; struct swap_cluster_info *ci; unsigned long tmp, max; new_cluster: cluster = this_cpu_ptr(si->percpu_cluster); if (cluster_is_null(&cluster->index)) { if (!cluster_list_empty(&si->free_clusters)) { cluster->index = si->free_clusters.head; cluster->next = cluster_next(&cluster->index) * SWAPFILE_CLUSTER; } else if (!cluster_list_empty(&si->discard_clusters)) { /* * we don't have free cluster but have some clusters in * discarding, do discard now and reclaim them, then * reread cluster_next_cpu since we dropped si->lock */ swap_do_scheduled_discard(si); *scan_base = this_cpu_read(*si->cluster_next_cpu); *offset = *scan_base; goto new_cluster; } else return false; } /* * Other CPUs can use our cluster if they can't find a free cluster, * check if there is still free entry in the cluster */ tmp = cluster->next; max = min_t(unsigned long, si->max, (cluster_next(&cluster->index) + 1) * SWAPFILE_CLUSTER); if (tmp < max) { ci = lock_cluster(si, tmp); while (tmp < max) { if (!si->swap_map[tmp]) break; tmp++; } unlock_cluster(ci); } if (tmp >= max) { cluster_set_null(&cluster->index); goto new_cluster; } cluster->next = tmp + 1; *offset = tmp; *scan_base = tmp; return true; } static void __del_from_avail_list(struct swap_info_struct *p) { int nid; assert_spin_locked(&p->lock); for_each_node(nid) plist_del(&p->avail_lists[nid], &swap_avail_heads[nid]); } static void del_from_avail_list(struct swap_info_struct *p) { spin_lock(&swap_avail_lock); __del_from_avail_list(p); spin_unlock(&swap_avail_lock); } static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, unsigned int nr_entries) { unsigned int end = offset + nr_entries - 1; if (offset == si->lowest_bit) si->lowest_bit += nr_entries; if (end == si->highest_bit) WRITE_ONCE(si->highest_bit, si->highest_bit - nr_entries); WRITE_ONCE(si->inuse_pages, si->inuse_pages + nr_entries); if (si->inuse_pages == si->pages) { si->lowest_bit = si->max; si->highest_bit = 0; del_from_avail_list(si); } } static void add_to_avail_list(struct swap_info_struct *p) { int nid; spin_lock(&swap_avail_lock); for_each_node(nid) { WARN_ON(!plist_node_empty(&p->avail_lists[nid])); plist_add(&p->avail_lists[nid], &swap_avail_heads[nid]); } spin_unlock(&swap_avail_lock); } static void swap_range_free(struct swap_info_struct *si, unsigned long offset, unsigned int nr_entries) { unsigned long begin = offset; unsigned long end = offset + nr_entries - 1; void (*swap_slot_free_notify)(struct block_device *, unsigned long); if (offset < si->lowest_bit) si->lowest_bit = offset; if (end > si->highest_bit) { bool was_full = !si->highest_bit; WRITE_ONCE(si->highest_bit, end); if (was_full && (si->flags & SWP_WRITEOK)) add_to_avail_list(si); } atomic_long_add(nr_entries, &nr_swap_pages); WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries); if (si->flags & SWP_BLKDEV) swap_slot_free_notify = si->bdev->bd_disk->fops->swap_slot_free_notify; else swap_slot_free_notify = NULL; while (offset <= end) { arch_swap_invalidate_page(si->type, offset); frontswap_invalidate_page(si->type, offset); if (swap_slot_free_notify) swap_slot_free_notify(si->bdev, offset); offset++; } clear_shadow_from_swap_cache(si->type, begin, end); } static void set_cluster_next(struct swap_info_struct *si, unsigned long next) { unsigned long prev; if (!(si->flags & SWP_SOLIDSTATE)) { si->cluster_next = next; return; } prev = this_cpu_read(*si->cluster_next_cpu); /* * Cross the swap address space size aligned trunk, choose * another trunk randomly to avoid lock contention on swap * address space if possible. */ if ((prev >> SWAP_ADDRESS_SPACE_SHIFT) != (next >> SWAP_ADDRESS_SPACE_SHIFT)) { /* No free swap slots available */ if (si->highest_bit <= si->lowest_bit) return; next = si->lowest_bit + prandom_u32_max(si->highest_bit - si->lowest_bit + 1); next = ALIGN_DOWN(next, SWAP_ADDRESS_SPACE_PAGES); next = max_t(unsigned int, next, si->lowest_bit); } this_cpu_write(*si->cluster_next_cpu, next); } static bool swap_offset_available_and_locked(struct swap_info_struct *si, unsigned long offset) { if (data_race(!si->swap_map[offset])) { spin_lock(&si->lock); return true; } if (vm_swap_full() && READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) { spin_lock(&si->lock); return true; } return false; } static int scan_swap_map_slots(struct swap_info_struct *si, unsigned char usage, int nr, swp_entry_t slots[]) { struct swap_cluster_info *ci; unsigned long offset; unsigned long scan_base; unsigned long last_in_cluster = 0; int latency_ration = LATENCY_LIMIT; int n_ret = 0; bool scanned_many = false; /* * We try to cluster swap pages by allocating them sequentially * in swap. Once we've allocated SWAPFILE_CLUSTER pages this * way, however, we resort to first-free allocation, starting * a new cluster. This prevents us from scattering swap pages * all over the entire swap partition, so that we reduce * overall disk seek times between swap pages. -- sct * But we do now try to find an empty cluster. -Andrea * And we let swap pages go all over an SSD partition. Hugh */ si->flags += SWP_SCANNING; /* * Use percpu scan base for SSD to reduce lock contention on * cluster and swap cache. For HDD, sequential access is more * important. */ if (si->flags & SWP_SOLIDSTATE) scan_base = this_cpu_read(*si->cluster_next_cpu); else scan_base = si->cluster_next; offset = scan_base; /* SSD algorithm */ if (si->cluster_info) { if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base)) goto scan; } else if (unlikely(!si->cluster_nr--)) { if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { si->cluster_nr = SWAPFILE_CLUSTER - 1; goto checks; } spin_unlock(&si->lock); /* * If seek is expensive, start searching for new cluster from * start of partition, to minimize the span of allocated swap. * If seek is cheap, that is the SWP_SOLIDSTATE si->cluster_info * case, just handled by scan_swap_map_try_ssd_cluster() above. */ scan_base = offset = si->lowest_bit; last_in_cluster = offset + SWAPFILE_CLUSTER - 1; /* Locate the first empty (unaligned) cluster */ for (; last_in_cluster <= si->highest_bit; offset++) { if (si->swap_map[offset]) last_in_cluster = offset + SWAPFILE_CLUSTER; else if (offset == last_in_cluster) { spin_lock(&si->lock); offset -= SWAPFILE_CLUSTER - 1; si->cluster_next = offset; si->cluster_nr = SWAPFILE_CLUSTER - 1; goto checks; } if (unlikely(--latency_ration < 0)) { cond_resched(); latency_ration = LATENCY_LIMIT; } } offset = scan_base; spin_lock(&si->lock); si->cluster_nr = SWAPFILE_CLUSTER - 1; } checks: if (si->cluster_info) { while (scan_swap_map_ssd_cluster_conflict(si, offset)) { /* take a break if we already got some slots */ if (n_ret) goto done; if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base)) goto scan; } } if (!(si->flags & SWP_WRITEOK)) goto no_page; if (!si->highest_bit) goto no_page; if (offset > si->highest_bit) scan_base = offset = si->lowest_bit; ci = lock_cluster(si, offset); /* reuse swap entry of cache-only swap if not busy. */ if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { int swap_was_freed; unlock_cluster(ci); spin_unlock(&si->lock); swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY); spin_lock(&si->lock); /* entry was freed successfully, try to use this again */ if (swap_was_freed) goto checks; goto scan; /* check next one */ } if (si->swap_map[offset]) { unlock_cluster(ci); if (!n_ret) goto scan; else goto done; } WRITE_ONCE(si->swap_map[offset], usage); inc_cluster_info_page(si, si->cluster_info, offset); unlock_cluster(ci); swap_range_alloc(si, offset, 1); slots[n_ret++] = swp_entry(si->type, offset); /* got enough slots or reach max slots? */ if ((n_ret == nr) || (offset >= si->highest_bit)) goto done; /* search for next available slot */ /* time to take a break? */ if (unlikely(--latency_ration < 0)) { if (n_ret) goto done; spin_unlock(&si->lock); cond_resched(); spin_lock(&si->lock); latency_ration = LATENCY_LIMIT; } /* try to get more slots in cluster */ if (si->cluster_info) { if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base)) goto checks; } else if (si->cluster_nr && !si->swap_map[++offset]) { /* non-ssd case, still more slots in cluster? */ --si->cluster_nr; goto checks; } /* * Even if there's no free clusters available (fragmented), * try to scan a little more quickly with lock held unless we * have scanned too many slots already. */ if (!scanned_many) { unsigned long scan_limit; if (offset < scan_base) scan_limit = scan_base; else scan_limit = si->highest_bit; for (; offset <= scan_limit && --latency_ration > 0; offset++) { if (!si->swap_map[offset]) goto checks; } } done: set_cluster_next(si, offset + 1); si->flags -= SWP_SCANNING; return n_ret; scan: spin_unlock(&si->lock); while (++offset <= READ_ONCE(si->highest_bit)) { if (unlikely(--latency_ration < 0)) { cond_resched(); latency_ration = LATENCY_LIMIT; scanned_many = true; } if (swap_offset_available_and_locked(si, offset)) goto checks; } offset = si->lowest_bit; while (offset < scan_base) { if (unlikely(--latency_ration < 0)) { cond_resched(); latency_ration = LATENCY_LIMIT; scanned_many = true; } if (swap_offset_available_and_locked(si, offset)) goto checks; offset++; } spin_lock(&si->lock); no_page: si->flags -= SWP_SCANNING; return n_ret; } static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot) { unsigned long idx; struct swap_cluster_info *ci; unsigned long offset; /* * Should not even be attempting cluster allocations when huge * page swap is disabled. Warn and fail the allocation. */ if (!IS_ENABLED(CONFIG_THP_SWAP)) { VM_WARN_ON_ONCE(1); return 0; } if (cluster_list_empty(&si->free_clusters)) return 0; idx = cluster_list_first(&si->free_clusters); offset = idx * SWAPFILE_CLUSTER; ci = lock_cluster(si, offset); alloc_cluster(si, idx); cluster_set_count_flag(ci, SWAPFILE_CLUSTER, CLUSTER_FLAG_HUGE); memset(si->swap_map + offset, SWAP_HAS_CACHE, SWAPFILE_CLUSTER); unlock_cluster(ci); swap_range_alloc(si, offset, SWAPFILE_CLUSTER); *slot = swp_entry(si->type, offset); return 1; } static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx) { unsigned long offset = idx * SWAPFILE_CLUSTER; struct swap_cluster_info *ci; ci = lock_cluster(si, offset); memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER); cluster_set_count_flag(ci, 0, 0); free_cluster(si, idx); unlock_cluster(ci); swap_range_free(si, offset, SWAPFILE_CLUSTER); } int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size) { unsigned long size = swap_entry_size(entry_size); struct swap_info_struct *si, *next; long avail_pgs; int n_ret = 0; int node; /* Only single cluster request supported */ WARN_ON_ONCE(n_goal > 1 && size == SWAPFILE_CLUSTER); spin_lock(&swap_avail_lock); avail_pgs = atomic_long_read(&nr_swap_pages) / size; if (avail_pgs <= 0) { spin_unlock(&swap_avail_lock); goto noswap; } n_goal = min3((long)n_goal, (long)SWAP_BATCH, avail_pgs); atomic_long_sub(n_goal * size, &nr_swap_pages); start_over: node = numa_node_id(); plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) { /* requeue si to after same-priority siblings */ plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]); spin_unlock(&swap_avail_lock); spin_lock(&si->lock); if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) { spin_lock(&swap_avail_lock); if (plist_node_empty(&si->avail_lists[node])) { spin_unlock(&si->lock); goto nextsi; } WARN(!si->highest_bit, "swap_info %d in list but !highest_bit\n", si->type); WARN(!(si->flags & SWP_WRITEOK), "swap_info %d in list but !SWP_WRITEOK\n", si->type); __del_from_avail_list(si); spin_unlock(&si->lock); goto nextsi; } if (size == SWAPFILE_CLUSTER) { if (si->flags & SWP_BLKDEV) n_ret = swap_alloc_cluster(si, swp_entries); } else n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, n_goal, swp_entries); spin_unlock(&si->lock); if (n_ret || size == SWAPFILE_CLUSTER) goto check_out; pr_debug("scan_swap_map of si %d failed to find offset\n", si->type); cond_resched(); spin_lock(&swap_avail_lock); nextsi: /* * if we got here, it's likely that si was almost full before, * and since scan_swap_map_slots() can drop the si->lock, * multiple callers probably all tried to get a page from the * same si and it filled up before we could get one; or, the si * filled up between us dropping swap_avail_lock and taking * si->lock. Since we dropped the swap_avail_lock, the * swap_avail_head list may have been modified; so if next is * still in the swap_avail_head list then try it, otherwise * start over if we have not gotten any slots. */ if (plist_node_empty(&next->avail_lists[node])) goto start_over; } spin_unlock(&swap_avail_lock); check_out: if (n_ret < n_goal) atomic_long_add((long)(n_goal - n_ret) * size, &nr_swap_pages); noswap: return n_ret; } static struct swap_info_struct *_swap_info_get(swp_entry_t entry) { struct swap_info_struct *p; unsigned long offset; if (!entry.val) goto out; p = swp_swap_info(entry); if (!p) goto bad_nofile; if (data_race(!(p->flags & SWP_USED))) goto bad_device; offset = swp_offset(entry); if (offset >= p->max) goto bad_offset; if (data_race(!p->swap_map[swp_offset(entry)])) goto bad_free; return p; bad_free: pr_err("%s: %s%08lx\n", __func__, Unused_offset, entry.val); goto out; bad_offset: pr_err("%s: %s%08lx\n", __func__, Bad_offset, entry.val); goto out; bad_device: pr_err("%s: %s%08lx\n", __func__, Unused_file, entry.val); goto out; bad_nofile: pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val); out: return NULL; } static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry, struct swap_info_struct *q) { struct swap_info_struct *p; p = _swap_info_get(entry); if (p != q) { if (q != NULL) spin_unlock(&q->lock); if (p != NULL) spin_lock(&p->lock); } return p; } static unsigned char __swap_entry_free_locked(struct swap_info_struct *p, unsigned long offset, unsigned char usage) { unsigned char count; unsigned char has_cache; count = p->swap_map[offset]; has_cache = count & SWAP_HAS_CACHE; count &= ~SWAP_HAS_CACHE; if (usage == SWAP_HAS_CACHE) { VM_BUG_ON(!has_cache); has_cache = 0; } else if (count == SWAP_MAP_SHMEM) { /* * Or we could insist on shmem.c using a special * swap_shmem_free() and free_shmem_swap_and_cache()... */ count = 0; } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) { if (count == COUNT_CONTINUED) { if (swap_count_continued(p, offset, count)) count = SWAP_MAP_MAX | COUNT_CONTINUED; else count = SWAP_MAP_MAX; } else count--; } usage = count | has_cache; if (usage) WRITE_ONCE(p->swap_map[offset], usage); else WRITE_ONCE(p->swap_map[offset], SWAP_HAS_CACHE); return usage; } /* * When we get a swap entry, if there aren't some other ways to * prevent swapoff, such as the folio in swap cache is locked, page * table lock is held, etc., the swap entry may become invalid because * of swapoff. Then, we need to enclose all swap related functions * with get_swap_device() and put_swap_device(), unless the swap * functions call get/put_swap_device() by themselves. * * Note that when only holding the PTL, swapoff might succeed immediately * after freeing a swap entry. Therefore, immediately after * __swap_entry_free(), the swap info might become stale and should not * be touched without a prior get_swap_device(). * * Check whether swap entry is valid in the swap device. If so, * return pointer to swap_info_struct, and keep the swap entry valid * via preventing the swap device from being swapoff, until * put_swap_device() is called. Otherwise return NULL. * * Notice that swapoff or swapoff+swapon can still happen before the * percpu_ref_tryget_live() in get_swap_device() or after the * percpu_ref_put() in put_swap_device() if there isn't any other way * to prevent swapoff. The caller must be prepared for that. For * example, the following situation is possible. * * CPU1 CPU2 * do_swap_page() * ... swapoff+swapon * __read_swap_cache_async() * swapcache_prepare() * __swap_duplicate() * // check swap_map * // verify PTE not changed * * In __swap_duplicate(), the swap_map need to be checked before * changing partly because the specified swap entry may be for another * swap device which has been swapoff. And in do_swap_page(), after * the page is read from the swap device, the PTE is verified not * changed with the page table locked to check whether the swap device * has been swapoff or swapoff+swapon. */ struct swap_info_struct *get_swap_device(swp_entry_t entry) { struct swap_info_struct *si; unsigned long offset; if (!entry.val) goto out; si = swp_swap_info(entry); if (!si) goto bad_nofile; if (!percpu_ref_tryget_live(&si->users)) goto out; /* * Guarantee the si->users are checked before accessing other * fields of swap_info_struct. * * Paired with the spin_unlock() after setup_swap_info() in * enable_swap_info(). */ smp_rmb(); offset = swp_offset(entry); if (offset >= si->max) goto put_out; return si; bad_nofile: pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val); out: return NULL; put_out: pr_err("%s: %s%08lx\n", __func__, Bad_offset, entry.val); percpu_ref_put(&si->users); return NULL; } static unsigned char __swap_entry_free(struct swap_info_struct *p, swp_entry_t entry) { struct swap_cluster_info *ci; unsigned long offset = swp_offset(entry); unsigned char usage; ci = lock_cluster_or_swap_info(p, offset); usage = __swap_entry_free_locked(p, offset, 1); unlock_cluster_or_swap_info(p, ci); if (!usage) free_swap_slot(entry); return usage; } static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry) { struct swap_cluster_info *ci; unsigned long offset = swp_offset(entry); unsigned char count; ci = lock_cluster(p, offset); count = p->swap_map[offset]; VM_BUG_ON(count != SWAP_HAS_CACHE); p->swap_map[offset] = 0; dec_cluster_info_page(p, p->cluster_info, offset); unlock_cluster(ci); mem_cgroup_uncharge_swap(entry, 1); swap_range_free(p, offset, 1); } /* * Caller has made sure that the swap device corresponding to entry * is still around or has not been recycled. */ void swap_free(swp_entry_t entry) { struct swap_info_struct *p; p = _swap_info_get(entry); if (p) __swap_entry_free(p, entry); } /* * Called after dropping swapcache to decrease refcnt to swap entries. */ void put_swap_folio(struct folio *folio, swp_entry_t entry) { unsigned long offset = swp_offset(entry); unsigned long idx = offset / SWAPFILE_CLUSTER; struct swap_cluster_info *ci; struct swap_info_struct *si; unsigned char *map; unsigned int i, free_entries = 0; unsigned char val; int size = swap_entry_size(folio_nr_pages(folio)); si = _swap_info_get(entry); if (!si) return; ci = lock_cluster_or_swap_info(si, offset); if (size == SWAPFILE_CLUSTER) { VM_BUG_ON(!cluster_is_huge(ci)); map = si->swap_map + offset; for (i = 0; i < SWAPFILE_CLUSTER; i++) { val = map[i]; VM_BUG_ON(!(val & SWAP_HAS_CACHE)); if (val == SWAP_HAS_CACHE) free_entries++; } cluster_clear_huge(ci); if (free_entries == SWAPFILE_CLUSTER) { unlock_cluster_or_swap_info(si, ci); spin_lock(&si->lock); mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER); swap_free_cluster(si, idx); spin_unlock(&si->lock); return; } } for (i = 0; i < size; i++, entry.val++) { if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) { unlock_cluster_or_swap_info(si, ci); free_swap_slot(entry); if (i == size - 1) return; lock_cluster_or_swap_info(si, offset); } } unlock_cluster_or_swap_info(si, ci); } #ifdef CONFIG_THP_SWAP int split_swap_cluster(swp_entry_t entry) { struct swap_info_struct *si; struct swap_cluster_info *ci; unsigned long offset = swp_offset(entry); si = _swap_info_get(entry); if (!si) return -EBUSY; ci = lock_cluster(si, offset); cluster_clear_huge(ci); unlock_cluster(ci); return 0; } #endif static int swp_entry_cmp(const void *ent1, const void *ent2) { const swp_entry_t *e1 = ent1, *e2 = ent2; return (int)swp_type(*e1) - (int)swp_type(*e2); } void swapcache_free_entries(swp_entry_t *entries, int n) { struct swap_info_struct *p, *prev; int i; if (n <= 0) return; prev = NULL; p = NULL; /* * Sort swap entries by swap device, so each lock is only taken once. * nr_swapfiles isn't absolutely correct, but the overhead of sort() is * so low that it isn't necessary to optimize further. */ if (nr_swapfiles > 1) sort(entries, n, sizeof(entries[0]), swp_entry_cmp, NULL); for (i = 0; i < n; ++i) { p = swap_info_get_cont(entries[i], prev); if (p) swap_entry_free(p, entries[i]); prev = p; } if (p) spin_unlock(&p->lock); } int __swap_count(swp_entry_t entry) { struct swap_info_struct *si; pgoff_t offset = swp_offset(entry); int count = 0; si = get_swap_device(entry); if (si) { count = swap_count(si->swap_map[offset]); put_swap_device(si); } return count; } /* * How many references to @entry are currently swapped out? * This does not give an exact answer when swap count is continued, * but does include the high COUNT_CONTINUED flag to allow for that. */ static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) { pgoff_t offset = swp_offset(entry); struct swap_cluster_info *ci; int count; ci = lock_cluster_or_swap_info(si, offset); count = swap_count(si->swap_map[offset]); unlock_cluster_or_swap_info(si, ci); return count; } /* * How many references to @entry are currently swapped out? * This does not give an exact answer when swap count is continued, * but does include the high COUNT_CONTINUED flag to allow for that. */ int __swp_swapcount(swp_entry_t entry) { int count = 0; struct swap_info_struct *si; si = get_swap_device(entry); if (si) { count = swap_swapcount(si, entry); put_swap_device(si); } return count; } /* * How many references to @entry are currently swapped out? * This considers COUNT_CONTINUED so it returns exact answer. */ int swp_swapcount(swp_entry_t entry) { int count, tmp_count, n; struct swap_info_struct *p; struct swap_cluster_info *ci; struct page *page; pgoff_t offset; unsigned char *map; p = _swap_info_get(entry); if (!p) return 0; offset = swp_offset(entry); ci = lock_cluster_or_swap_info(p, offset); count = swap_count(p->swap_map[offset]); if (!(count & COUNT_CONTINUED)) goto out; count &= ~COUNT_CONTINUED; n = SWAP_MAP_MAX + 1; page = vmalloc_to_page(p->swap_map + offset); offset &= ~PAGE_MASK; VM_BUG_ON(page_private(page) != SWP_CONTINUED); do { page = list_next_entry(page, lru); map = kmap_atomic(page); tmp_count = map[offset]; kunmap_atomic(map); count += (tmp_count & ~COUNT_CONTINUED) * n; n *= (SWAP_CONT_MAX + 1); } while (tmp_count & COUNT_CONTINUED); out: unlock_cluster_or_swap_info(p, ci); return count; } static bool swap_page_trans_huge_swapped(struct swap_info_struct *si, swp_entry_t entry) { struct swap_cluster_info *ci; unsigned char *map = si->swap_map; unsigned long roffset = swp_offset(entry); unsigned long offset = round_down(roffset, SWAPFILE_CLUSTER); int i; bool ret = false; ci = lock_cluster_or_swap_info(si, offset); if (!ci || !cluster_is_huge(ci)) { if (swap_count(map[roffset])) ret = true; goto unlock_out; } for (i = 0; i < SWAPFILE_CLUSTER; i++) { if (swap_count(map[offset + i])) { ret = true; break; } } unlock_out: unlock_cluster_or_swap_info(si, ci); return ret; } static bool folio_swapped(struct folio *folio) { swp_entry_t entry = folio_swap_entry(folio); struct swap_info_struct *si = _swap_info_get(entry); if (!si) return false; if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!folio_test_large(folio))) return swap_swapcount(si, entry) != 0; return swap_page_trans_huge_swapped(si, entry); } /** * folio_free_swap() - Free the swap space used for this folio. * @folio: The folio to remove. * * If swap is getting full, or if there are no more mappings of this folio, * then call folio_free_swap to free its swap space. * * Return: true if we were able to release the swap space. */ bool folio_free_swap(struct folio *folio) { VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); if (!folio_test_swapcache(folio)) return false; if (folio_test_writeback(folio)) return false; if (folio_swapped(folio)) return false; /* * Once hibernation has begun to create its image of memory, * there's a danger that one of the calls to folio_free_swap() * - most probably a call from __try_to_reclaim_swap() while * hibernation is allocating its own swap pages for the image, * but conceivably even a call from memory reclaim - will free * the swap from a folio which has already been recorded in the * image as a clean swapcache folio, and then reuse its swap for * another page of the image. On waking from hibernation, the * original folio might be freed under memory pressure, then * later read back in from swap, now with the wrong data. * * Hibernation suspends storage while it is writing the image * to disk so check that here. */ if (pm_suspended_storage()) return false; delete_from_swap_cache(folio); folio_set_dirty(folio); return true; } /* * Free the swap entry like above, but also try to * free the page cache entry if it is the last user. */ int free_swap_and_cache(swp_entry_t entry) { struct swap_info_struct *p; unsigned char count; if (non_swap_entry(entry)) return 1; p = get_swap_device(entry); if (p) { if (WARN_ON(data_race(!p->swap_map[swp_offset(entry)]))) { put_swap_device(p); return 0; } count = __swap_entry_free(p, entry); if (count == SWAP_HAS_CACHE && !swap_page_trans_huge_swapped(p, entry)) __try_to_reclaim_swap(p, swp_offset(entry), TTRS_UNMAPPED | TTRS_FULL); put_swap_device(p); } return p != NULL; } #ifdef CONFIG_HIBERNATION swp_entry_t get_swap_page_of_type(int type) { struct swap_info_struct *si = swap_type_to_swap_info(type); swp_entry_t entry = {0}; if (!si) goto fail; /* This is called for allocating swap entry, not cache */ spin_lock(&si->lock); if ((si->flags & SWP_WRITEOK) && scan_swap_map_slots(si, 1, 1, &entry)) atomic_long_dec(&nr_swap_pages); spin_unlock(&si->lock); fail: return entry; } /* * Find the swap type that corresponds to given device (if any). * * @offset - number of the PAGE_SIZE-sized block of the device, starting * from 0, in which the swap header is expected to be located. * * This is needed for the suspend to disk (aka swsusp). */ int swap_type_of(dev_t device, sector_t offset) { int type; if (!device) return -1; spin_lock(&swap_lock); for (type = 0; type < nr_swapfiles; type++) { struct swap_info_struct *sis = swap_info[type]; if (!(sis->flags & SWP_WRITEOK)) continue; if (device == sis->bdev->bd_dev) { struct swap_extent *se = first_se(sis); if (se->start_block == offset) { spin_unlock(&swap_lock); return type; } } } spin_unlock(&swap_lock); return -ENODEV; } int find_first_swap(dev_t *device) { int type; spin_lock(&swap_lock); for (type = 0; type < nr_swapfiles; type++) { struct swap_info_struct *sis = swap_info[type]; if (!(sis->flags & SWP_WRITEOK)) continue; *device = sis->bdev->bd_dev; spin_unlock(&swap_lock); return type; } spin_unlock(&swap_lock); return -ENODEV; } /* * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev * corresponding to given index in swap_info (swap type). */ sector_t swapdev_block(int type, pgoff_t offset) { struct swap_info_struct *si = swap_type_to_swap_info(type); struct swap_extent *se; if (!si || !(si->flags & SWP_WRITEOK)) return 0; se = offset_to_swap_extent(si, offset); return se->start_block + (offset - se->start_page); } /* * Return either the total number of swap pages of given type, or the number * of free pages of that type (depending on @free) * * This is needed for software suspend */ unsigned int count_swap_pages(int type, int free) { unsigned int n = 0; spin_lock(&swap_lock); if ((unsigned int)type < nr_swapfiles) { struct swap_info_struct *sis = swap_info[type]; spin_lock(&sis->lock); if (sis->flags & SWP_WRITEOK) { n = sis->pages; if (free) n -= sis->inuse_pages; } spin_unlock(&sis->lock); } spin_unlock(&swap_lock); return n; } #endif /* CONFIG_HIBERNATION */ static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte) { return pte_same(pte_swp_clear_flags(pte), swp_pte); } /* * No need to decide whether this PTE shares the swap entry with others, * just let do_wp_page work it out if a write is requested later - to * force COW, vm_page_prot omits write permission from any private vma. */ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, swp_entry_t entry, struct folio *folio) { struct page *page = folio_file_page(folio, swp_offset(entry)); struct page *swapcache; spinlock_t *ptl; pte_t *pte, new_pte; int ret = 1; swapcache = page; page = ksm_might_need_to_copy(page, vma, addr); if (unlikely(!page)) return -ENOMEM; pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) { ret = 0; goto out; } if (unlikely(!PageUptodate(page))) { pte_t pteval; dec_mm_counter(vma->vm_mm, MM_SWAPENTS); pteval = swp_entry_to_pte(make_swapin_error_entry(page)); set_pte_at(vma->vm_mm, addr, pte, pteval); swap_free(entry); ret = 0; goto out; } /* * Some architectures may have to restore extra metadata to the page * when reading from swap. This metadata may be indexed by swap entry * so this must be called before swap_free(). */ arch_swap_restore(entry, page_folio(page)); /* See do_swap_page() */ BUG_ON(!PageAnon(page) && PageMappedToDisk(page)); BUG_ON(PageAnon(page) && PageAnonExclusive(page)); dec_mm_counter(vma->vm_mm, MM_SWAPENTS); inc_mm_counter(vma->vm_mm, MM_ANONPAGES); get_page(page); if (page == swapcache) { rmap_t rmap_flags = RMAP_NONE; /* * See do_swap_page(): PageWriteback() would be problematic. * However, we do a wait_on_page_writeback() just before this * call and have the page locked. */ VM_BUG_ON_PAGE(PageWriteback(page), page); if (pte_swp_exclusive(*pte)) rmap_flags |= RMAP_EXCLUSIVE; page_add_anon_rmap(page, vma, addr, rmap_flags); } else { /* ksm created a completely new copy */ page_add_new_anon_rmap(page, vma, addr); lru_cache_add_inactive_or_unevictable(page, vma); } new_pte = pte_mkold(mk_pte(page, vma->vm_page_prot)); if (pte_swp_soft_dirty(*pte)) new_pte = pte_mksoft_dirty(new_pte); if (pte_swp_uffd_wp(*pte)) new_pte = pte_mkuffd_wp(new_pte); set_pte_at(vma->vm_mm, addr, pte, new_pte); swap_free(entry); out: pte_unmap_unlock(pte, ptl); if (page != swapcache) { unlock_page(page); put_page(page); } return ret; } static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, unsigned int type) { swp_entry_t entry; pte_t *pte; struct swap_info_struct *si; int ret = 0; volatile unsigned char *swap_map; si = swap_info[type]; pte = pte_offset_map(pmd, addr); do { struct folio *folio; unsigned long offset; if (!is_swap_pte(*pte)) continue; entry = pte_to_swp_entry(*pte); if (swp_type(entry) != type) continue; offset = swp_offset(entry); pte_unmap(pte); swap_map = &si->swap_map[offset]; folio = swap_cache_get_folio(entry, vma, addr); if (!folio) { struct page *page; struct vm_fault vmf = { .vma = vma, .address = addr, .real_address = addr, .pmd = pmd, }; page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, &vmf); if (page) folio = page_folio(page); } if (!folio) { if (*swap_map == 0 || *swap_map == SWAP_MAP_BAD) goto try_next; return -ENOMEM; } folio_lock(folio); folio_wait_writeback(folio); ret = unuse_pte(vma, pmd, addr, entry, folio); if (ret < 0) { folio_unlock(folio); folio_put(folio); goto out; } folio_free_swap(folio); folio_unlock(folio); folio_put(folio); try_next: pte = pte_offset_map(pmd, addr); } while (pte++, addr += PAGE_SIZE, addr != end); pte_unmap(pte - 1); ret = 0; out: return ret; } static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, unsigned int type) { pmd_t *pmd; unsigned long next; int ret; pmd = pmd_offset(pud, addr); do { cond_resched(); next = pmd_addr_end(addr, end); if (pmd_none_or_trans_huge_or_clear_bad(pmd)) continue; ret = unuse_pte_range(vma, pmd, addr, next, type); if (ret) return ret; } while (pmd++, addr = next, addr != end); return 0; } static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d, unsigned long addr, unsigned long end, unsigned int type) { pud_t *pud; unsigned long next; int ret; pud = pud_offset(p4d, addr); do { next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) continue; ret = unuse_pmd_range(vma, pud, addr, next, type); if (ret) return ret; } while (pud++, addr = next, addr != end); return 0; } static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, unsigned int type) { p4d_t *p4d; unsigned long next; int ret; p4d = p4d_offset(pgd, addr); do { next = p4d_addr_end(addr, end); if (p4d_none_or_clear_bad(p4d)) continue; ret = unuse_pud_range(vma, p4d, addr, next, type); if (ret) return ret; } while (p4d++, addr = next, addr != end); return 0; } static int unuse_vma(struct vm_area_struct *vma, unsigned int type) { pgd_t *pgd; unsigned long addr, end, next; int ret; addr = vma->vm_start; end = vma->vm_end; pgd = pgd_offset(vma->vm_mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; ret = unuse_p4d_range(vma, pgd, addr, next, type); if (ret) return ret; } while (pgd++, addr = next, addr != end); return 0; } static int unuse_mm(struct mm_struct *mm, unsigned int type) { struct vm_area_struct *vma; int ret = 0; VMA_ITERATOR(vmi, mm, 0); mmap_read_lock(mm); for_each_vma(vmi, vma) { if (vma->anon_vma && !is_vm_hugetlb_page(vma)) { ret = unuse_vma(vma, type); if (ret) break; } cond_resched(); } mmap_read_unlock(mm); return ret; } /* * Scan swap_map from current position to next entry still in use. * Return 0 if there are no inuse entries after prev till end of * the map. */ static unsigned int find_next_to_unuse(struct swap_info_struct *si, unsigned int prev) { unsigned int i; unsigned char count; /* * No need for swap_lock here: we're just looking * for whether an entry is in use, not modifying it; false * hits are okay, and sys_swapoff() has already prevented new * allocations from this area (while holding swap_lock). */ for (i = prev + 1; i < si->max; i++) { count = READ_ONCE(si->swap_map[i]); if (count && swap_count(count) != SWAP_MAP_BAD) break; if ((i % LATENCY_LIMIT) == 0) cond_resched(); } if (i == si->max) i = 0; return i; } static int try_to_unuse(unsigned int type) { struct mm_struct *prev_mm; struct mm_struct *mm; struct list_head *p; int retval = 0; struct swap_info_struct *si = swap_info[type]; struct folio *folio; swp_entry_t entry; unsigned int i; if (!READ_ONCE(si->inuse_pages)) return 0; retry: retval = shmem_unuse(type); if (retval) return retval; prev_mm = &init_mm; mmget(prev_mm); spin_lock(&mmlist_lock); p = &init_mm.mmlist; while (READ_ONCE(si->inuse_pages) && !signal_pending(current) && (p = p->next) != &init_mm.mmlist) { mm = list_entry(p, struct mm_struct, mmlist); if (!mmget_not_zero(mm)) continue; spin_unlock(&mmlist_lock); mmput(prev_mm); prev_mm = mm; retval = unuse_mm(mm, type); if (retval) { mmput(prev_mm); return retval; } /* * Make sure that we aren't completely killing * interactive performance. */ cond_resched(); spin_lock(&mmlist_lock); } spin_unlock(&mmlist_lock); mmput(prev_mm); i = 0; while (READ_ONCE(si->inuse_pages) && !signal_pending(current) && (i = find_next_to_unuse(si, i)) != 0) { entry = swp_entry(type, i); folio = filemap_get_folio(swap_address_space(entry), i); if (!folio) continue; /* * It is conceivable that a racing task removed this folio from * swap cache just before we acquired the page lock. The folio * might even be back in swap cache on another swap area. But * that is okay, folio_free_swap() only removes stale folios. */ folio_lock(folio); folio_wait_writeback(folio); folio_free_swap(folio); folio_unlock(folio); folio_put(folio); } /* * Lets check again to see if there are still swap entries in the map. * If yes, we would need to do retry the unuse logic again. * Under global memory pressure, swap entries can be reinserted back * into process space after the mmlist loop above passes over them. * * Limit the number of retries? No: when mmget_not_zero() * above fails, that mm is likely to be freeing swap from * exit_mmap(), which proceeds at its own independent pace; * and even shmem_writepage() could have been preempted after * folio_alloc_swap(), temporarily hiding that swap. It's easy * and robust (though cpu-intensive) just to keep retrying. */ if (READ_ONCE(si->inuse_pages)) { if (!signal_pending(current)) goto retry; return -EINTR; } return 0; } /* * After a successful try_to_unuse, if no swap is now in use, we know * we can empty the mmlist. swap_lock must be held on entry and exit. * Note that mmlist_lock nests inside swap_lock, and an mm must be * added to the mmlist just after page_duplicate - before would be racy. */ static void drain_mmlist(void) { struct list_head *p, *next; unsigned int type; for (type = 0; type < nr_swapfiles; type++) if (swap_info[type]->inuse_pages) return; spin_lock(&mmlist_lock); list_for_each_safe(p, next, &init_mm.mmlist) list_del_init(p); spin_unlock(&mmlist_lock); } /* * Free all of a swapdev's extent information */ static void destroy_swap_extents(struct swap_info_struct *sis) { while (!RB_EMPTY_ROOT(&sis->swap_extent_root)) { struct rb_node *rb = sis->swap_extent_root.rb_node; struct swap_extent *se = rb_entry(rb, struct swap_extent, rb_node); rb_erase(rb, &sis->swap_extent_root); kfree(se); } if (sis->flags & SWP_ACTIVATED) { struct file *swap_file = sis->swap_file; struct address_space *mapping = swap_file->f_mapping; sis->flags &= ~SWP_ACTIVATED; if (mapping->a_ops->swap_deactivate) mapping->a_ops->swap_deactivate(swap_file); } } /* * Add a block range (and the corresponding page range) into this swapdev's * extent tree. * * This function rather assumes that it is called in ascending page order. */ int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, unsigned long nr_pages, sector_t start_block) { struct rb_node **link = &sis->swap_extent_root.rb_node, *parent = NULL; struct swap_extent *se; struct swap_extent *new_se; /* * place the new node at the right most since the * function is called in ascending page order. */ while (*link) { parent = *link; link = &parent->rb_right; } if (parent) { se = rb_entry(parent, struct swap_extent, rb_node); BUG_ON(se->start_page + se->nr_pages != start_page); if (se->start_block + se->nr_pages == start_block) { /* Merge it */ se->nr_pages += nr_pages; return 0; } } /* No merge, insert a new extent. */ new_se = kmalloc(sizeof(*se), GFP_KERNEL); if (new_se == NULL) return -ENOMEM; new_se->start_page = start_page; new_se->nr_pages = nr_pages; new_se->start_block = start_block; rb_link_node(&new_se->rb_node, parent, link); rb_insert_color(&new_se->rb_node, &sis->swap_extent_root); return 1; } EXPORT_SYMBOL_GPL(add_swap_extent); /* * A `swap extent' is a simple thing which maps a contiguous range of pages * onto a contiguous range of disk blocks. A rbtree of swap extents is * built at swapon time and is then used at swap_writepage/swap_readpage * time for locating where on disk a page belongs. * * If the swapfile is an S_ISBLK block device, a single extent is installed. * This is done so that the main operating code can treat S_ISBLK and S_ISREG * swap files identically. * * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap * extent rbtree operates in PAGE_SIZE disk blocks. Both S_ISREG and S_ISBLK * swapfiles are handled *identically* after swapon time. * * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks * and will parse them into a rbtree, in PAGE_SIZE chunks. If some stray * blocks are found which do not fall within the PAGE_SIZE alignment * requirements, they are simply tossed out - we will never use those blocks * for swapping. * * For all swap devices we set S_SWAPFILE across the life of the swapon. This * prevents users from writing to the swap device, which will corrupt memory. * * The amount of disk space which a single swap extent represents varies. * Typically it is in the 1-4 megabyte range. So we can have hundreds of * extents in the rbtree. - akpm. */ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) { struct file *swap_file = sis->swap_file; struct address_space *mapping = swap_file->f_mapping; struct inode *inode = mapping->host; int ret; if (S_ISBLK(inode->i_mode)) { ret = add_swap_extent(sis, 0, sis->max, 0); *span = sis->pages; return ret; } if (mapping->a_ops->swap_activate) { ret = mapping->a_ops->swap_activate(sis, swap_file, span); if (ret < 0) return ret; sis->flags |= SWP_ACTIVATED; if ((sis->flags & SWP_FS_OPS) && sio_pool_init() != 0) { destroy_swap_extents(sis); return -ENOMEM; } return ret; } return generic_swapfile_activate(sis, swap_file, span); } static int swap_node(struct swap_info_struct *p) { struct block_device *bdev; if (p->bdev) bdev = p->bdev; else bdev = p->swap_file->f_inode->i_sb->s_bdev; return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE; } static void setup_swap_info(struct swap_info_struct *p, int prio, unsigned char *swap_map, struct swap_cluster_info *cluster_info) { int i; if (prio >= 0) p->prio = prio; else p->prio = --least_priority; /* * the plist prio is negated because plist ordering is * low-to-high, while swap ordering is high-to-low */ p->list.prio = -p->prio; for_each_node(i) { if (p->prio >= 0) p->avail_lists[i].prio = -p->prio; else { if (swap_node(p) == i) p->avail_lists[i].prio = 1; else p->avail_lists[i].prio = -p->prio; } } p->swap_map = swap_map; p->cluster_info = cluster_info; } static void _enable_swap_info(struct swap_info_struct *p) { p->flags |= SWP_WRITEOK; atomic_long_add(p->pages, &nr_swap_pages); total_swap_pages += p->pages; assert_spin_locked(&swap_lock); /* * both lists are plists, and thus priority ordered. * swap_active_head needs to be priority ordered for swapoff(), * which on removal of any swap_info_struct with an auto-assigned * (i.e. negative) priority increments the auto-assigned priority * of any lower-priority swap_info_structs. * swap_avail_head needs to be priority ordered for folio_alloc_swap(), * which allocates swap pages from the highest available priority * swap_info_struct. */ plist_add(&p->list, &swap_active_head); add_to_avail_list(p); } static void enable_swap_info(struct swap_info_struct *p, int prio, unsigned char *swap_map, struct swap_cluster_info *cluster_info, unsigned long *frontswap_map) { if (IS_ENABLED(CONFIG_FRONTSWAP)) frontswap_init(p->type, frontswap_map); spin_lock(&swap_lock); spin_lock(&p->lock); setup_swap_info(p, prio, swap_map, cluster_info); spin_unlock(&p->lock); spin_unlock(&swap_lock); /* * Finished initializing swap device, now it's safe to reference it. */ percpu_ref_resurrect(&p->users); spin_lock(&swap_lock); spin_lock(&p->lock); _enable_swap_info(p); spin_unlock(&p->lock); spin_unlock(&swap_lock); } static void reinsert_swap_info(struct swap_info_struct *p) { spin_lock(&swap_lock); spin_lock(&p->lock); setup_swap_info(p, p->prio, p->swap_map, p->cluster_info); _enable_swap_info(p); spin_unlock(&p->lock); spin_unlock(&swap_lock); } bool has_usable_swap(void) { bool ret = true; spin_lock(&swap_lock); if (plist_head_empty(&swap_active_head)) ret = false; spin_unlock(&swap_lock); return ret; } SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) { struct swap_info_struct *p = NULL; unsigned char *swap_map; struct swap_cluster_info *cluster_info; unsigned long *frontswap_map; struct file *swap_file, *victim; struct address_space *mapping; struct inode *inode; struct filename *pathname; int err, found = 0; unsigned int old_block_size; bool hibernation_swap = false; if (!capable(CAP_SYS_ADMIN)) return -EPERM; BUG_ON(!current->mm); pathname = getname(specialfile); if (IS_ERR(pathname)) return PTR_ERR(pathname); victim = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0); err = PTR_ERR(victim); if (IS_ERR(victim)) goto out; mapping = victim->f_mapping; spin_lock(&swap_lock); plist_for_each_entry(p, &swap_active_head, list) { if (p->flags & SWP_WRITEOK) { if (p->swap_file->f_mapping == mapping) { found = 1; break; } } } if (!found) { err = -EINVAL; spin_unlock(&swap_lock); goto out_dput; } if (!security_vm_enough_memory_mm(current->mm, p->pages)) vm_unacct_memory(p->pages); else { err = -ENOMEM; spin_unlock(&swap_lock); goto out_dput; } spin_lock(&p->lock); del_from_avail_list(p); if (p->prio < 0) { struct swap_info_struct *si = p; int nid; plist_for_each_entry_continue(si, &swap_active_head, list) { si->prio++; si->list.prio--; for_each_node(nid) { if (si->avail_lists[nid].prio != 1) si->avail_lists[nid].prio--; } } least_priority++; } plist_del(&p->list, &swap_active_head); atomic_long_sub(p->pages, &nr_swap_pages); total_swap_pages -= p->pages; p->flags &= ~SWP_WRITEOK; spin_unlock(&p->lock); spin_unlock(&swap_lock); disable_swap_slots_cache_lock(); set_current_oom_origin(); err = try_to_unuse(p->type); clear_current_oom_origin(); if (err) { /* re-insert swap space back into swap_list */ reinsert_swap_info(p); reenable_swap_slots_cache_unlock(); goto out_dput; } reenable_swap_slots_cache_unlock(); /* * Wait for swap operations protected by get/put_swap_device() * to complete. * * We need synchronize_rcu() here to protect the accessing to * the swap cache data structure. */ percpu_ref_kill(&p->users); synchronize_rcu(); wait_for_completion(&p->comp); flush_work(&p->discard_work); destroy_swap_extents(p); trace_android_vh_check_hibernation_swap(p->bdev, &hibernation_swap); if (p->flags & SWP_CONTINUED) free_swap_count_continuations(p); if (!p->bdev || hibernation_swap || !bdev_nonrot(p->bdev)) atomic_dec(&nr_rotate_swap); mutex_lock(&swapon_mutex); spin_lock(&swap_lock); spin_lock(&p->lock); drain_mmlist(); /* wait for anyone still in scan_swap_map_slots */ p->highest_bit = 0; /* cuts scans short */ while (p->flags >= SWP_SCANNING) { spin_unlock(&p->lock); spin_unlock(&swap_lock); schedule_timeout_uninterruptible(1); spin_lock(&swap_lock); spin_lock(&p->lock); } swap_file = p->swap_file; old_block_size = p->old_block_size; p->swap_file = NULL; p->max = 0; swap_map = p->swap_map; p->swap_map = NULL; cluster_info = p->cluster_info; p->cluster_info = NULL; frontswap_map = frontswap_map_get(p); spin_unlock(&p->lock); spin_unlock(&swap_lock); arch_swap_invalidate_area(p->type); frontswap_invalidate_area(p->type); frontswap_map_set(p, NULL); mutex_unlock(&swapon_mutex); free_percpu(p->percpu_cluster); p->percpu_cluster = NULL; free_percpu(p->cluster_next_cpu); p->cluster_next_cpu = NULL; vfree(swap_map); kvfree(cluster_info); kvfree(frontswap_map); /* Destroy swap account information */ swap_cgroup_swapoff(p->type); exit_swap_address_space(p->type); inode = mapping->host; if (S_ISBLK(inode->i_mode)) { struct block_device *bdev = I_BDEV(inode); set_blocksize(bdev, old_block_size); blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); } inode_lock(inode); inode->i_flags &= ~S_SWAPFILE; inode_unlock(inode); filp_close(swap_file, NULL); /* * Clear the SWP_USED flag after all resources are freed so that swapon * can reuse this swap_info in alloc_swap_info() safely. It is ok to * not hold p->lock after we cleared its SWP_WRITEOK. */ spin_lock(&swap_lock); p->flags = 0; spin_unlock(&swap_lock); err = 0; atomic_inc(&proc_poll_event); wake_up_interruptible(&proc_poll_wait); out_dput: filp_close(victim, NULL); out: putname(pathname); return err; } #ifdef CONFIG_PROC_FS static __poll_t swaps_poll(struct file *file, poll_table *wait) { struct seq_file *seq = file->private_data; poll_wait(file, &proc_poll_wait, wait); if (seq->poll_event != atomic_read(&proc_poll_event)) { seq->poll_event = atomic_read(&proc_poll_event); return EPOLLIN | EPOLLRDNORM | EPOLLERR | EPOLLPRI; } return EPOLLIN | EPOLLRDNORM; } /* iterator */ static void *swap_start(struct seq_file *swap, loff_t *pos) { struct swap_info_struct *si; int type; loff_t l = *pos; mutex_lock(&swapon_mutex); if (!l) return SEQ_START_TOKEN; for (type = 0; (si = swap_type_to_swap_info(type)); type++) { if (!(si->flags & SWP_USED) || !si->swap_map) continue; if (!--l) return si; } return NULL; } static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) { struct swap_info_struct *si = v; int type; if (v == SEQ_START_TOKEN) type = 0; else type = si->type + 1; ++(*pos); for (; (si = swap_type_to_swap_info(type)); type++) { if (!(si->flags & SWP_USED) || !si->swap_map) continue; return si; } return NULL; } static void swap_stop(struct seq_file *swap, void *v) { mutex_unlock(&swapon_mutex); } static int swap_show(struct seq_file *swap, void *v) { struct swap_info_struct *si = v; struct file *file; int len; unsigned long bytes, inuse; if (si == SEQ_START_TOKEN) { seq_puts(swap, "Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n"); return 0; } bytes = si->pages << (PAGE_SHIFT - 10); inuse = READ_ONCE(si->inuse_pages) << (PAGE_SHIFT - 10); file = si->swap_file; len = seq_file_path(swap, file, " \t\n\\"); seq_printf(swap, "%*s%s\t%lu\t%s%lu\t%s%d\n", len < 40 ? 40 - len : 1, " ", S_ISBLK(file_inode(file)->i_mode) ? "partition" : "file\t", bytes, bytes < 10000000 ? "\t" : "", inuse, inuse < 10000000 ? "\t" : "", si->prio); return 0; } static const struct seq_operations swaps_op = { .start = swap_start, .next = swap_next, .stop = swap_stop, .show = swap_show }; static int swaps_open(struct inode *inode, struct file *file) { struct seq_file *seq; int ret; ret = seq_open(file, &swaps_op); if (ret) return ret; seq = file->private_data; seq->poll_event = atomic_read(&proc_poll_event); return 0; } static const struct proc_ops swaps_proc_ops = { .proc_flags = PROC_ENTRY_PERMANENT, .proc_open = swaps_open, .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_release = seq_release, .proc_poll = swaps_poll, }; static int __init procswaps_init(void) { proc_create("swaps", 0, NULL, &swaps_proc_ops); return 0; } __initcall(procswaps_init); #endif /* CONFIG_PROC_FS */ #ifdef MAX_SWAPFILES_CHECK static int __init max_swapfiles_check(void) { MAX_SWAPFILES_CHECK(); return 0; } late_initcall(max_swapfiles_check); #endif static struct swap_info_struct *alloc_swap_info(void) { struct swap_info_struct *p; struct swap_info_struct *defer = NULL; unsigned int type; int i; p = kvzalloc(struct_size(p, avail_lists, nr_node_ids), GFP_KERNEL); if (!p) return ERR_PTR(-ENOMEM); if (percpu_ref_init(&p->users, swap_users_ref_free, PERCPU_REF_INIT_DEAD, GFP_KERNEL)) { kvfree(p); return ERR_PTR(-ENOMEM); } spin_lock(&swap_lock); for (type = 0; type < nr_swapfiles; type++) { if (!(swap_info[type]->flags & SWP_USED)) break; } if (type >= MAX_SWAPFILES) { spin_unlock(&swap_lock); percpu_ref_exit(&p->users); kvfree(p); return ERR_PTR(-EPERM); } if (type >= nr_swapfiles) { p->type = type; /* * Publish the swap_info_struct after initializing it. * Note that kvzalloc() above zeroes all its fields. */ smp_store_release(&swap_info[type], p); /* rcu_assign_pointer() */ nr_swapfiles++; } else { defer = p; p = swap_info[type]; /* * Do not memset this entry: a racing procfs swap_next() * would be relying on p->type to remain valid. */ } p->swap_extent_root = RB_ROOT; plist_node_init(&p->list, 0); for_each_node(i) plist_node_init(&p->avail_lists[i], 0); p->flags = SWP_USED; spin_unlock(&swap_lock); if (defer) { percpu_ref_exit(&defer->users); kvfree(defer); } spin_lock_init(&p->lock); spin_lock_init(&p->cont_lock); init_completion(&p->comp); return p; } static int claim_swapfile(struct swap_info_struct *p, struct inode *inode) { int error; if (S_ISBLK(inode->i_mode)) { p->bdev = blkdev_get_by_dev(inode->i_rdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL, p); if (IS_ERR(p->bdev)) { error = PTR_ERR(p->bdev); p->bdev = NULL; return error; } p->old_block_size = block_size(p->bdev); error = set_blocksize(p->bdev, PAGE_SIZE); if (error < 0) return error; /* * Zoned block devices contain zones that have a sequential * write only restriction. Hence zoned block devices are not * suitable for swapping. Disallow them here. */ if (bdev_is_zoned(p->bdev)) return -EINVAL; p->flags |= SWP_BLKDEV; } else if (S_ISREG(inode->i_mode)) { p->bdev = inode->i_sb->s_bdev; } return 0; } /* * Find out how many pages are allowed for a single swap device. There * are two limiting factors: * 1) the number of bits for the swap offset in the swp_entry_t type, and * 2) the number of bits in the swap pte, as defined by the different * architectures. * * In order to find the largest possible bit mask, a swap entry with * swap type 0 and swap offset ~0UL is created, encoded to a swap pte, * decoded to a swp_entry_t again, and finally the swap offset is * extracted. * * This will mask all the bits from the initial ~0UL mask that can't * be encoded in either the swp_entry_t or the architecture definition * of a swap pte. */ unsigned long generic_max_swapfile_size(void) { return swp_offset(pte_to_swp_entry( swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; } /* Can be overridden by an architecture for additional checks. */ __weak unsigned long arch_max_swapfile_size(void) { return generic_max_swapfile_size(); } static unsigned long read_swap_header(struct swap_info_struct *p, union swap_header *swap_header, struct inode *inode) { int i; unsigned long maxpages; unsigned long swapfilepages; unsigned long last_page; if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) { pr_err("Unable to find swap-space signature\n"); return 0; } /* swap partition endianness hack... */ if (swab32(swap_header->info.version) == 1) { swab32s(&swap_header->info.version); swab32s(&swap_header->info.last_page); swab32s(&swap_header->info.nr_badpages); if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) return 0; for (i = 0; i < swap_header->info.nr_badpages; i++) swab32s(&swap_header->info.badpages[i]); } /* Check the swap header's sub-version */ if (swap_header->info.version != 1) { pr_warn("Unable to handle swap header version %d\n", swap_header->info.version); return 0; } p->lowest_bit = 1; p->cluster_next = 1; p->cluster_nr = 0; maxpages = swapfile_maximum_size; last_page = swap_header->info.last_page; if (!last_page) { pr_warn("Empty swap-file\n"); return 0; } if (last_page > maxpages) { pr_warn("Truncating oversized swap area, only using %luk out of %luk\n", maxpages << (PAGE_SHIFT - 10), last_page << (PAGE_SHIFT - 10)); } if (maxpages > last_page) { maxpages = last_page + 1; /* p->max is an unsigned int: don't overflow it */ if ((unsigned int)maxpages == 0) maxpages = UINT_MAX; } p->highest_bit = maxpages - 1; if (!maxpages) return 0; swapfilepages = i_size_read(inode) >> PAGE_SHIFT; if (swapfilepages && maxpages > swapfilepages) { pr_warn("Swap area shorter than signature indicates\n"); return 0; } if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) return 0; if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) return 0; return maxpages; } #define SWAP_CLUSTER_INFO_COLS \ DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info)) #define SWAP_CLUSTER_SPACE_COLS \ DIV_ROUND_UP(SWAP_ADDRESS_SPACE_PAGES, SWAPFILE_CLUSTER) #define SWAP_CLUSTER_COLS \ max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS) static int setup_swap_map_and_extents(struct swap_info_struct *p, union swap_header *swap_header, unsigned char *swap_map, struct swap_cluster_info *cluster_info, unsigned long maxpages, sector_t *span) { unsigned int j, k; unsigned int nr_good_pages; int nr_extents; unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); unsigned long col = p->cluster_next / SWAPFILE_CLUSTER % SWAP_CLUSTER_COLS; unsigned long i, idx; nr_good_pages = maxpages - 1; /* omit header page */ cluster_list_init(&p->free_clusters); cluster_list_init(&p->discard_clusters); for (i = 0; i < swap_header->info.nr_badpages; i++) { unsigned int page_nr = swap_header->info.badpages[i]; if (page_nr == 0 || page_nr > swap_header->info.last_page) return -EINVAL; if (page_nr < maxpages) { swap_map[page_nr] = SWAP_MAP_BAD; nr_good_pages--; /* * Haven't marked the cluster free yet, no list * operation involved */ inc_cluster_info_page(p, cluster_info, page_nr); } } /* Haven't marked the cluster free yet, no list operation involved */ for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++) inc_cluster_info_page(p, cluster_info, i); if (nr_good_pages) { swap_map[0] = SWAP_MAP_BAD; /* * Not mark the cluster free yet, no list * operation involved */ inc_cluster_info_page(p, cluster_info, 0); p->max = maxpages; p->pages = nr_good_pages; nr_extents = setup_swap_extents(p, span); if (nr_extents < 0) return nr_extents; nr_good_pages = p->pages; } if (!nr_good_pages) { pr_warn("Empty swap-file\n"); return -EINVAL; } if (!cluster_info) return nr_extents; /* * Reduce false cache line sharing between cluster_info and * sharing same address space. */ for (k = 0; k < SWAP_CLUSTER_COLS; k++) { j = (k + col) % SWAP_CLUSTER_COLS; for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) { idx = i * SWAP_CLUSTER_COLS + j; if (idx >= nr_clusters) continue; if (cluster_count(&cluster_info[idx])) continue; cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE); cluster_list_add_tail(&p->free_clusters, cluster_info, idx); } } return nr_extents; } SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) { struct swap_info_struct *p; struct filename *name; struct file *swap_file = NULL; struct address_space *mapping; struct dentry *dentry; int prio; int error; union swap_header *swap_header; int nr_extents; sector_t span; unsigned long maxpages; unsigned char *swap_map = NULL; struct swap_cluster_info *cluster_info = NULL; unsigned long *frontswap_map = NULL; struct page *page = NULL; struct inode *inode = NULL; bool inced_nr_rotate_swap = false; bool hibernation_swap = false; if (swap_flags & ~SWAP_FLAGS_VALID) return -EINVAL; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (!swap_avail_heads) return -ENOMEM; p = alloc_swap_info(); if (IS_ERR(p)) return PTR_ERR(p); INIT_WORK(&p->discard_work, swap_discard_work); name = getname(specialfile); if (IS_ERR(name)) { error = PTR_ERR(name); name = NULL; goto bad_swap; } swap_file = file_open_name(name, O_RDWR|O_LARGEFILE, 0); if (IS_ERR(swap_file)) { error = PTR_ERR(swap_file); swap_file = NULL; goto bad_swap; } p->swap_file = swap_file; mapping = swap_file->f_mapping; dentry = swap_file->f_path.dentry; inode = mapping->host; error = claim_swapfile(p, inode); if (unlikely(error)) goto bad_swap; inode_lock(inode); if (d_unlinked(dentry) || cant_mount(dentry)) { error = -ENOENT; goto bad_swap_unlock_inode; } if (IS_SWAPFILE(inode)) { error = -EBUSY; goto bad_swap_unlock_inode; } /* * Read the swap header. */ if (!mapping->a_ops->read_folio) { error = -EINVAL; goto bad_swap_unlock_inode; } page = read_mapping_page(mapping, 0, swap_file); if (IS_ERR(page)) { error = PTR_ERR(page); goto bad_swap_unlock_inode; } swap_header = kmap(page); maxpages = read_swap_header(p, swap_header, inode); if (unlikely(!maxpages)) { error = -EINVAL; goto bad_swap_unlock_inode; } /* OK, set up the swap map and apply the bad block list */ swap_map = vzalloc(maxpages); if (!swap_map) { error = -ENOMEM; goto bad_swap_unlock_inode; } trace_android_vh_check_hibernation_swap(p->bdev, &hibernation_swap); if (p->bdev && bdev_stable_writes(p->bdev)) p->flags |= SWP_STABLE_WRITES; if (p->bdev && p->bdev->bd_disk->fops->rw_page) p->flags |= SWP_SYNCHRONOUS_IO; if (p->bdev && !hibernation_swap && bdev_nonrot(p->bdev)) { int cpu; unsigned long ci, nr_cluster; p->flags |= SWP_SOLIDSTATE; p->cluster_next_cpu = alloc_percpu(unsigned int); if (!p->cluster_next_cpu) { error = -ENOMEM; goto bad_swap_unlock_inode; } /* * select a random position to start with to help wear leveling * SSD */ for_each_possible_cpu(cpu) { per_cpu(*p->cluster_next_cpu, cpu) = 1 + prandom_u32_max(p->highest_bit); } nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); cluster_info = kvcalloc(nr_cluster, sizeof(*cluster_info), GFP_KERNEL); if (!cluster_info) { error = -ENOMEM; goto bad_swap_unlock_inode; } for (ci = 0; ci < nr_cluster; ci++) spin_lock_init(&((cluster_info + ci)->lock)); p->percpu_cluster = alloc_percpu(struct percpu_cluster); if (!p->percpu_cluster) { error = -ENOMEM; goto bad_swap_unlock_inode; } for_each_possible_cpu(cpu) { struct percpu_cluster *cluster; cluster = per_cpu_ptr(p->percpu_cluster, cpu); cluster_set_null(&cluster->index); } } else { atomic_inc(&nr_rotate_swap); inced_nr_rotate_swap = true; } error = swap_cgroup_swapon(p->type, maxpages); if (error) goto bad_swap_unlock_inode; nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map, cluster_info, maxpages, &span); if (unlikely(nr_extents < 0)) { error = nr_extents; goto bad_swap_unlock_inode; } /* frontswap enabled? set up bit-per-page map for frontswap */ if (IS_ENABLED(CONFIG_FRONTSWAP)) frontswap_map = kvcalloc(BITS_TO_LONGS(maxpages), sizeof(long), GFP_KERNEL); if ((swap_flags & SWAP_FLAG_DISCARD) && p->bdev && bdev_max_discard_sectors(p->bdev)) { /* * When discard is enabled for swap with no particular * policy flagged, we set all swap discard flags here in * order to sustain backward compatibility with older * swapon(8) releases. */ p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD | SWP_PAGE_DISCARD); /* * By flagging sys_swapon, a sysadmin can tell us to * either do single-time area discards only, or to just * perform discards for released swap page-clusters. * Now it's time to adjust the p->flags accordingly. */ if (swap_flags & SWAP_FLAG_DISCARD_ONCE) p->flags &= ~SWP_PAGE_DISCARD; else if (swap_flags & SWAP_FLAG_DISCARD_PAGES) p->flags &= ~SWP_AREA_DISCARD; /* issue a swapon-time discard if it's still required */ if (p->flags & SWP_AREA_DISCARD) { int err = discard_swap(p); if (unlikely(err)) pr_err("swapon: discard_swap(%p): %d\n", p, err); } } error = init_swap_address_space(p->type, maxpages); if (error) goto bad_swap_unlock_inode; /* * Flush any pending IO and dirty mappings before we start using this * swap device. */ inode->i_flags |= S_SWAPFILE; error = inode_drain_writes(inode); if (error) { inode->i_flags &= ~S_SWAPFILE; goto free_swap_address_space; } mutex_lock(&swapon_mutex); prio = -1; if (swap_flags & SWAP_FLAG_PREFER) prio = (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map); pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s%s\n", p->pages<<(PAGE_SHIFT-10), name->name, p->prio, nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), (p->flags & SWP_SOLIDSTATE) ? "SS" : "", (p->flags & SWP_DISCARDABLE) ? "D" : "", (p->flags & SWP_AREA_DISCARD) ? "s" : "", (p->flags & SWP_PAGE_DISCARD) ? "c" : "", (frontswap_map) ? "FS" : ""); mutex_unlock(&swapon_mutex); atomic_inc(&proc_poll_event); wake_up_interruptible(&proc_poll_wait); error = 0; goto out; free_swap_address_space: exit_swap_address_space(p->type); bad_swap_unlock_inode: inode_unlock(inode); bad_swap: free_percpu(p->percpu_cluster); p->percpu_cluster = NULL; free_percpu(p->cluster_next_cpu); p->cluster_next_cpu = NULL; if (inode && S_ISBLK(inode->i_mode) && p->bdev) { set_blocksize(p->bdev, p->old_block_size); blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); } inode = NULL; destroy_swap_extents(p); swap_cgroup_swapoff(p->type); spin_lock(&swap_lock); p->swap_file = NULL; p->flags = 0; spin_unlock(&swap_lock); vfree(swap_map); kvfree(cluster_info); kvfree(frontswap_map); if (inced_nr_rotate_swap) atomic_dec(&nr_rotate_swap); if (swap_file) filp_close(swap_file, NULL); out: if (page && !IS_ERR(page)) { kunmap(page); put_page(page); } if (name) putname(name); if (inode) inode_unlock(inode); if (!error) enable_swap_slots_cache(); return error; } void si_swapinfo(struct sysinfo *val) { unsigned int type; unsigned long nr_to_be_unused = 0; spin_lock(&swap_lock); for (type = 0; type < nr_swapfiles; type++) { struct swap_info_struct *si = swap_info[type]; if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK)) nr_to_be_unused += READ_ONCE(si->inuse_pages); } val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused; val->totalswap = total_swap_pages + nr_to_be_unused; spin_unlock(&swap_lock); } EXPORT_SYMBOL_NS_GPL(si_swapinfo, MINIDUMP); /* * Verify that a swap entry is valid and increment its swap map count. * * Returns error code in following case. * - success -> 0 * - swp_entry is invalid -> EINVAL * - swp_entry is migration entry -> EINVAL * - swap-cache reference is requested but there is already one. -> EEXIST * - swap-cache reference is requested but the entry is not used. -> ENOENT * - swap-mapped reference requested but needs continued swap count. -> ENOMEM */ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) { struct swap_info_struct *p; struct swap_cluster_info *ci; unsigned long offset; unsigned char count; unsigned char has_cache; int err; p = get_swap_device(entry); if (!p) return -EINVAL; offset = swp_offset(entry); ci = lock_cluster_or_swap_info(p, offset); count = p->swap_map[offset]; /* * swapin_readahead() doesn't check if a swap entry is valid, so the * swap entry could be SWAP_MAP_BAD. Check here with lock held. */ if (unlikely(swap_count(count) == SWAP_MAP_BAD)) { err = -ENOENT; goto unlock_out; } has_cache = count & SWAP_HAS_CACHE; count &= ~SWAP_HAS_CACHE; err = 0; if (usage == SWAP_HAS_CACHE) { /* set SWAP_HAS_CACHE if there is no cache and entry is used */ if (!has_cache && count) has_cache = SWAP_HAS_CACHE; else if (has_cache) /* someone else added cache */ err = -EEXIST; else /* no users remaining */ err = -ENOENT; } else if (count || has_cache) { if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX) count += usage; else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX) err = -EINVAL; else if (swap_count_continued(p, offset, count)) count = COUNT_CONTINUED; else err = -ENOMEM; } else err = -ENOENT; /* unused swap entry */ WRITE_ONCE(p->swap_map[offset], count | has_cache); unlock_out: unlock_cluster_or_swap_info(p, ci); put_swap_device(p); return err; } /* * Help swapoff by noting that swap entry belongs to shmem/tmpfs * (in which case its reference count is never incremented). */ void swap_shmem_alloc(swp_entry_t entry) { __swap_duplicate(entry, SWAP_MAP_SHMEM); } /* * Increase reference count of swap entry by 1. * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required * but could not be atomically allocated. Returns 0, just as if it succeeded, * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which * might occur if a page table entry has got corrupted. */ int swap_duplicate(swp_entry_t entry) { int err = 0; while (!err && __swap_duplicate(entry, 1) == -ENOMEM) err = add_swap_count_continuation(entry, GFP_ATOMIC); return err; } /* * @entry: swap entry for which we allocate swap cache. * * Called when allocating swap cache for existing swap entry, * This can return error codes. Returns 0 at success. * -EEXIST means there is a swap cache. * Note: return code is different from swap_duplicate(). */ int swapcache_prepare(swp_entry_t entry) { return __swap_duplicate(entry, SWAP_HAS_CACHE); } void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry) { struct swap_cluster_info *ci; unsigned long offset = swp_offset(entry); unsigned char usage; ci = lock_cluster_or_swap_info(si, offset); usage = __swap_entry_free_locked(si, offset, SWAP_HAS_CACHE); unlock_cluster_or_swap_info(si, ci); if (!usage) free_swap_slot(entry); } struct swap_info_struct *swp_swap_info(swp_entry_t entry) { return swap_type_to_swap_info(swp_type(entry)); } struct swap_info_struct *page_swap_info(struct page *page) { swp_entry_t entry = { .val = page_private(page) }; return swp_swap_info(entry); } /* * out-of-line methods to avoid include hell. */ struct address_space *swapcache_mapping(struct folio *folio) { return page_swap_info(&folio->page)->swap_file->f_mapping; } EXPORT_SYMBOL_GPL(swapcache_mapping); pgoff_t __page_file_index(struct page *page) { swp_entry_t swap = { .val = page_private(page) }; return swp_offset(swap); } EXPORT_SYMBOL_GPL(__page_file_index); /* * add_swap_count_continuation - called when a swap count is duplicated * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's * page of the original vmalloc'ed swap_map, to hold the continuation count * (for that entry and for its neighbouring PAGE_SIZE swap entries). Called * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc. * * These continuation pages are seldom referenced: the common paths all work * on the original swap_map, only referring to a continuation page when the * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX. * * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL) * can be called after dropping locks. */ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) { struct swap_info_struct *si; struct swap_cluster_info *ci; struct page *head; struct page *page; struct page *list_page; pgoff_t offset; unsigned char count; int ret = 0; /* * When debugging, it's easier to use __GFP_ZERO here; but it's better * for latency not to zero a page while GFP_ATOMIC and holding locks. */ page = alloc_page(gfp_mask | __GFP_HIGHMEM); si = get_swap_device(entry); if (!si) { /* * An acceptable race has occurred since the failing * __swap_duplicate(): the swap device may be swapoff */ goto outer; } spin_lock(&si->lock); offset = swp_offset(entry); ci = lock_cluster(si, offset); count = swap_count(si->swap_map[offset]); if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) { /* * The higher the swap count, the more likely it is that tasks * will race to add swap count continuation: we need to avoid * over-provisioning. */ goto out; } if (!page) { ret = -ENOMEM; goto out; } /* * We are fortunate that although vmalloc_to_page uses pte_offset_map, * no architecture is using highmem pages for kernel page tables: so it * will not corrupt the GFP_ATOMIC caller's atomic page table kmaps. */ head = vmalloc_to_page(si->swap_map + offset); offset &= ~PAGE_MASK; spin_lock(&si->cont_lock); /* * Page allocation does not initialize the page's lru field, * but it does always reset its private field. */ if (!page_private(head)) { BUG_ON(count & COUNT_CONTINUED); INIT_LIST_HEAD(&head->lru); set_page_private(head, SWP_CONTINUED); si->flags |= SWP_CONTINUED; } list_for_each_entry(list_page, &head->lru, lru) { unsigned char *map; /* * If the previous map said no continuation, but we've found * a continuation page, free our allocation and use this one. */ if (!(count & COUNT_CONTINUED)) goto out_unlock_cont; map = kmap_atomic(list_page) + offset; count = *map; kunmap_atomic(map); /* * If this continuation count now has some space in it, * free our allocation and use this one. */ if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX) goto out_unlock_cont; } list_add_tail(&page->lru, &head->lru); page = NULL; /* now it's attached, don't free it */ out_unlock_cont: spin_unlock(&si->cont_lock); out: unlock_cluster(ci); spin_unlock(&si->lock); put_swap_device(si); outer: if (page) __free_page(page); return ret; } /* * swap_count_continued - when the original swap_map count is incremented * from SWAP_MAP_MAX, check if there is already a continuation page to carry * into, carry if so, or else fail until a new continuation page is allocated; * when the original swap_map count is decremented from 0 with continuation, * borrow from the continuation and report whether it still holds more. * Called while __swap_duplicate() or swap_entry_free() holds swap or cluster * lock. */ static bool swap_count_continued(struct swap_info_struct *si, pgoff_t offset, unsigned char count) { struct page *head; struct page *page; unsigned char *map; bool ret; head = vmalloc_to_page(si->swap_map + offset); if (page_private(head) != SWP_CONTINUED) { BUG_ON(count & COUNT_CONTINUED); return false; /* need to add count continuation */ } spin_lock(&si->cont_lock); offset &= ~PAGE_MASK; page = list_next_entry(head, lru); map = kmap_atomic(page) + offset; if (count == SWAP_MAP_MAX) /* initial increment from swap_map */ goto init_map; /* jump over SWAP_CONT_MAX checks */ if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */ /* * Think of how you add 1 to 999 */ while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) { kunmap_atomic(map); page = list_next_entry(page, lru); BUG_ON(page == head); map = kmap_atomic(page) + offset; } if (*map == SWAP_CONT_MAX) { kunmap_atomic(map); page = list_next_entry(page, lru); if (page == head) { ret = false; /* add count continuation */ goto out; } map = kmap_atomic(page) + offset; init_map: *map = 0; /* we didn't zero the page */ } *map += 1; kunmap_atomic(map); while ((page = list_prev_entry(page, lru)) != head) { map = kmap_atomic(page) + offset; *map = COUNT_CONTINUED; kunmap_atomic(map); } ret = true; /* incremented */ } else { /* decrementing */ /* * Think of how you subtract 1 from 1000 */ BUG_ON(count != COUNT_CONTINUED); while (*map == COUNT_CONTINUED) { kunmap_atomic(map); page = list_next_entry(page, lru); BUG_ON(page == head); map = kmap_atomic(page) + offset; } BUG_ON(*map == 0); *map -= 1; if (*map == 0) count = 0; kunmap_atomic(map); while ((page = list_prev_entry(page, lru)) != head) { map = kmap_atomic(page) + offset; *map = SWAP_CONT_MAX | count; count = COUNT_CONTINUED; kunmap_atomic(map); } ret = count == COUNT_CONTINUED; } out: spin_unlock(&si->cont_lock); return ret; } /* * free_swap_count_continuations - swapoff free all the continuation pages * appended to the swap_map, after swap_map is quiesced, before vfree'ing it. */ static void free_swap_count_continuations(struct swap_info_struct *si) { pgoff_t offset; for (offset = 0; offset < si->max; offset += PAGE_SIZE) { struct page *head; head = vmalloc_to_page(si->swap_map + offset); if (page_private(head)) { struct page *page, *next; list_for_each_entry_safe(page, next, &head->lru, lru) { list_del(&page->lru); __free_page(page); } } } } #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) void __cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask) { struct swap_info_struct *si, *next; int nid = page_to_nid(page); if (!(gfp_mask & __GFP_IO)) return; if (!blk_cgroup_congested()) return; /* * We've already scheduled a throttle, avoid taking the global swap * lock. */ if (current->throttle_queue) return; spin_lock(&swap_avail_lock); plist_for_each_entry_safe(si, next, &swap_avail_heads[nid], avail_lists[nid]) { if (si->bdev) { blkcg_schedule_throttle(si->bdev->bd_disk, true); break; } } spin_unlock(&swap_avail_lock); } #endif static int __init swapfile_init(void) { int nid; swap_avail_heads = kmalloc_array(nr_node_ids, sizeof(struct plist_head), GFP_KERNEL); if (!swap_avail_heads) { pr_emerg("Not enough memory for swap heads, swap is disabled\n"); return -ENOMEM; } for_each_node(nid) plist_head_init(&swap_avail_heads[nid]); swapfile_maximum_size = arch_max_swapfile_size(); #ifdef CONFIG_MIGRATION if (swapfile_maximum_size >= (1UL << SWP_MIG_TOTAL_BITS)) swap_migration_ad_supported = true; #endif /* CONFIG_MIGRATION */ return 0; } subsys_initcall(swapfile_init); |
7 7 7 7 2 7 7 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2010 Werner Fink, Jiri Slaby */ #include <linux/console.h> #include <linux/kernel.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/tty_driver.h> /* * This is handler for /proc/consoles */ static int show_console_dev(struct seq_file *m, void *v) { static const struct { short flag; char name; } con_flags[] = { { CON_ENABLED, 'E' }, { CON_CONSDEV, 'C' }, { CON_BOOT, 'B' }, { CON_PRINTBUFFER, 'p' }, { CON_BRL, 'b' }, { CON_ANYTIME, 'a' }, }; char flags[ARRAY_SIZE(con_flags) + 1]; struct console *con = v; unsigned int a; dev_t dev = 0; if (con->device) { const struct tty_driver *driver; int index; driver = con->device(con, &index); if (driver) { dev = MKDEV(driver->major, driver->minor_start); dev += index; } } for (a = 0; a < ARRAY_SIZE(con_flags); a++) flags[a] = (con->flags & con_flags[a].flag) ? con_flags[a].name : ' '; flags[a] = 0; seq_setwidth(m, 21 - 1); seq_printf(m, "%s%d", con->name, con->index); seq_pad(m, ' '); seq_printf(m, "%c%c%c (%s)", con->read ? 'R' : '-', con->write ? 'W' : '-', con->unblank ? 'U' : '-', flags); if (dev) seq_printf(m, " %4d:%d", MAJOR(dev), MINOR(dev)); seq_putc(m, '\n'); return 0; } static void *c_start(struct seq_file *m, loff_t *pos) { struct console *con; loff_t off = 0; console_lock(); for_each_console(con) if (off++ == *pos) break; return con; } static void *c_next(struct seq_file *m, void *v, loff_t *pos) { struct console *con = v; ++*pos; return con->next; } static void c_stop(struct seq_file *m, void *v) { console_unlock(); } static const struct seq_operations consoles_op = { .start = c_start, .next = c_next, .stop = c_stop, .show = show_console_dev }; static int __init proc_consoles_init(void) { proc_create_seq("consoles", 0, NULL, &consoles_op); return 0; } fs_initcall(proc_consoles_init); |
50 54 92 83 45 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Cryptographic API. * * Null algorithms, aka Much Ado About Nothing. * * These are needed for IPsec, and may be useful in general for * testing & debugging. * * The null cipher is compliant with RFC2410. * * Copyright (c) 2002 James Morris <jmorris@intercode.com.au> */ #include <crypto/null.h> #include <crypto/internal/hash.h> #include <crypto/internal/skcipher.h> #include <linux/init.h> #include <linux/module.h> #include <linux/mm.h> #include <linux/string.h> static DEFINE_MUTEX(crypto_default_null_skcipher_lock); static struct crypto_sync_skcipher *crypto_default_null_skcipher; static int crypto_default_null_skcipher_refcnt; static int null_compress(struct crypto_tfm *tfm, const u8 *src, unsigned int slen, u8 *dst, unsigned int *dlen) { if (slen > *dlen) return -EINVAL; memcpy(dst, src, slen); *dlen = slen; return 0; } static int null_init(struct shash_desc *desc) { return 0; } static int null_update(struct shash_desc *desc, const u8 *data, unsigned int len) { return 0; } static int null_final(struct shash_desc *desc, u8 *out) { return 0; } static int null_digest(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { return 0; } static int null_hash_setkey(struct crypto_shash *tfm, const u8 *key, unsigned int keylen) { return 0; } static int null_skcipher_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { return 0; } static int null_setkey(struct crypto_tfm *tfm, const u8 *key, unsigned int keylen) { return 0; } static void null_crypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { memcpy(dst, src, NULL_BLOCK_SIZE); } static int null_skcipher_crypt(struct skcipher_request *req) { struct skcipher_walk walk; int err; err = skcipher_walk_virt(&walk, req, false); while (walk.nbytes) { if (walk.src.virt.addr != walk.dst.virt.addr) memcpy(walk.dst.virt.addr, walk.src.virt.addr, walk.nbytes); err = skcipher_walk_done(&walk, 0); } return err; } static struct shash_alg digest_null = { .digestsize = NULL_DIGEST_SIZE, .setkey = null_hash_setkey, .init = null_init, .update = null_update, .finup = null_digest, .digest = null_digest, .final = null_final, .base = { .cra_name = "digest_null", .cra_driver_name = "digest_null-generic", .cra_blocksize = NULL_BLOCK_SIZE, .cra_module = THIS_MODULE, } }; static struct skcipher_alg skcipher_null = { .base.cra_name = "ecb(cipher_null)", .base.cra_driver_name = "ecb-cipher_null", .base.cra_priority = 100, .base.cra_blocksize = NULL_BLOCK_SIZE, .base.cra_ctxsize = 0, .base.cra_module = THIS_MODULE, .min_keysize = NULL_KEY_SIZE, .max_keysize = NULL_KEY_SIZE, .ivsize = NULL_IV_SIZE, .setkey = null_skcipher_setkey, .encrypt = null_skcipher_crypt, .decrypt = null_skcipher_crypt, }; static struct crypto_alg null_algs[] = { { .cra_name = "cipher_null", .cra_driver_name = "cipher_null-generic", .cra_flags = CRYPTO_ALG_TYPE_CIPHER, .cra_blocksize = NULL_BLOCK_SIZE, .cra_ctxsize = 0, .cra_module = THIS_MODULE, .cra_u = { .cipher = { .cia_min_keysize = NULL_KEY_SIZE, .cia_max_keysize = NULL_KEY_SIZE, .cia_setkey = null_setkey, .cia_encrypt = null_crypt, .cia_decrypt = null_crypt } } }, { .cra_name = "compress_null", .cra_driver_name = "compress_null-generic", .cra_flags = CRYPTO_ALG_TYPE_COMPRESS, .cra_blocksize = NULL_BLOCK_SIZE, .cra_ctxsize = 0, .cra_module = THIS_MODULE, .cra_u = { .compress = { .coa_compress = null_compress, .coa_decompress = null_compress } } } }; MODULE_ALIAS_CRYPTO("compress_null"); MODULE_ALIAS_CRYPTO("digest_null"); MODULE_ALIAS_CRYPTO("cipher_null"); struct crypto_sync_skcipher *crypto_get_default_null_skcipher(void) { struct crypto_sync_skcipher *tfm; mutex_lock(&crypto_default_null_skcipher_lock); tfm = crypto_default_null_skcipher; if (!tfm) { tfm = crypto_alloc_sync_skcipher("ecb(cipher_null)", 0, 0); if (IS_ERR(tfm)) goto unlock; crypto_default_null_skcipher = tfm; } crypto_default_null_skcipher_refcnt++; unlock: mutex_unlock(&crypto_default_null_skcipher_lock); return tfm; } EXPORT_SYMBOL_GPL(crypto_get_default_null_skcipher); void crypto_put_default_null_skcipher(void) { mutex_lock(&crypto_default_null_skcipher_lock); if (!--crypto_default_null_skcipher_refcnt) { crypto_free_sync_skcipher(crypto_default_null_skcipher); crypto_default_null_skcipher = NULL; } mutex_unlock(&crypto_default_null_skcipher_lock); } EXPORT_SYMBOL_GPL(crypto_put_default_null_skcipher); static int __init crypto_null_mod_init(void) { int ret = 0; ret = crypto_register_algs(null_algs, ARRAY_SIZE(null_algs)); if (ret < 0) goto out; ret = crypto_register_shash(&digest_null); if (ret < 0) goto out_unregister_algs; ret = crypto_register_skcipher(&skcipher_null); if (ret < 0) goto out_unregister_shash; return 0; out_unregister_shash: crypto_unregister_shash(&digest_null); out_unregister_algs: crypto_unregister_algs(null_algs, ARRAY_SIZE(null_algs)); out: return ret; } static void __exit crypto_null_mod_fini(void) { crypto_unregister_algs(null_algs, ARRAY_SIZE(null_algs)); crypto_unregister_shash(&digest_null); crypto_unregister_skcipher(&skcipher_null); } subsys_initcall(crypto_null_mod_init); module_exit(crypto_null_mod_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Null Cryptographic Algorithms"); |
2712 29800 163 1078 2 8 180 186 180 178 129 2 127 129 737 734 328 320 378 377 2 6125 12 92 118 29 64 15 18 45 4 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Linux Socket Filter Data Structures */ #ifndef __LINUX_FILTER_H__ #define __LINUX_FILTER_H__ #include <linux/atomic.h> #include <linux/bpf.h> #include <linux/refcount.h> #include <linux/compat.h> #include <linux/skbuff.h> #include <linux/linkage.h> #include <linux/printk.h> #include <linux/workqueue.h> #include <linux/sched.h> #include <linux/capability.h> #include <linux/set_memory.h> #include <linux/kallsyms.h> #include <linux/if_vlan.h> #include <linux/vmalloc.h> #include <linux/sockptr.h> #include <crypto/sha1.h> #include <linux/u64_stats_sync.h> #include <net/sch_generic.h> #include <asm/byteorder.h> #include <uapi/linux/filter.h> struct sk_buff; struct sock; struct seccomp_data; struct bpf_prog_aux; struct xdp_rxq_info; struct xdp_buff; struct sock_reuseport; struct ctl_table; struct ctl_table_header; /* ArgX, context and stack frame pointer register positions. Note, * Arg1, Arg2, Arg3, etc are used as argument mappings of function * calls in BPF_CALL instruction. */ #define BPF_REG_ARG1 BPF_REG_1 #define BPF_REG_ARG2 BPF_REG_2 #define BPF_REG_ARG3 BPF_REG_3 #define BPF_REG_ARG4 BPF_REG_4 #define BPF_REG_ARG5 BPF_REG_5 #define BPF_REG_CTX BPF_REG_6 #define BPF_REG_FP BPF_REG_10 /* Additional register mappings for converted user programs. */ #define BPF_REG_A BPF_REG_0 #define BPF_REG_X BPF_REG_7 #define BPF_REG_TMP BPF_REG_2 /* scratch reg */ #define BPF_REG_D BPF_REG_8 /* data, callee-saved */ #define BPF_REG_H BPF_REG_9 /* hlen, callee-saved */ /* Kernel hidden auxiliary/helper register. */ #define BPF_REG_AX MAX_BPF_REG #define MAX_BPF_EXT_REG (MAX_BPF_REG + 1) #define MAX_BPF_JIT_REG MAX_BPF_EXT_REG /* unused opcode to mark special call to bpf_tail_call() helper */ #define BPF_TAIL_CALL 0xf0 /* unused opcode to mark special load instruction. Same as BPF_ABS */ #define BPF_PROBE_MEM 0x20 /* unused opcode to mark call to interpreter with arguments */ #define BPF_CALL_ARGS 0xe0 /* unused opcode to mark speculation barrier for mitigating * Speculative Store Bypass */ #define BPF_NOSPEC 0xc0 /* As per nm, we expose JITed images as text (code) section for * kallsyms. That way, tools like perf can find it to match * addresses. */ #define BPF_SYM_ELF_TYPE 't' /* BPF program can access up to 512 bytes of stack space. */ #define MAX_BPF_STACK 512 /* Helper macros for filter block array initializers. */ /* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg */ #define BPF_ALU64_REG(OP, DST, SRC) \ ((struct bpf_insn) { \ .code = BPF_ALU64 | BPF_OP(OP) | BPF_X, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = 0, \ .imm = 0 }) #define BPF_ALU32_REG(OP, DST, SRC) \ ((struct bpf_insn) { \ .code = BPF_ALU | BPF_OP(OP) | BPF_X, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = 0, \ .imm = 0 }) /* ALU ops on immediates, bpf_add|sub|...: dst_reg += imm32 */ #define BPF_ALU64_IMM(OP, DST, IMM) \ ((struct bpf_insn) { \ .code = BPF_ALU64 | BPF_OP(OP) | BPF_K, \ .dst_reg = DST, \ .src_reg = 0, \ .off = 0, \ .imm = IMM }) #define BPF_ALU32_IMM(OP, DST, IMM) \ ((struct bpf_insn) { \ .code = BPF_ALU | BPF_OP(OP) | BPF_K, \ .dst_reg = DST, \ .src_reg = 0, \ .off = 0, \ .imm = IMM }) /* Endianess conversion, cpu_to_{l,b}e(), {l,b}e_to_cpu() */ #define BPF_ENDIAN(TYPE, DST, LEN) \ ((struct bpf_insn) { \ .code = BPF_ALU | BPF_END | BPF_SRC(TYPE), \ .dst_reg = DST, \ .src_reg = 0, \ .off = 0, \ .imm = LEN }) /* Short form of mov, dst_reg = src_reg */ #define BPF_MOV64_REG(DST, SRC) \ ((struct bpf_insn) { \ .code = BPF_ALU64 | BPF_MOV | BPF_X, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = 0, \ .imm = 0 }) #define BPF_MOV32_REG(DST, SRC) \ ((struct bpf_insn) { \ .code = BPF_ALU | BPF_MOV | BPF_X, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = 0, \ .imm = 0 }) /* Short form of mov, dst_reg = imm32 */ #define BPF_MOV64_IMM(DST, IMM) \ ((struct bpf_insn) { \ .code = BPF_ALU64 | BPF_MOV | BPF_K, \ .dst_reg = DST, \ .src_reg = 0, \ .off = 0, \ .imm = IMM }) #define BPF_MOV32_IMM(DST, IMM) \ ((struct bpf_insn) { \ .code = BPF_ALU | BPF_MOV | BPF_K, \ .dst_reg = DST, \ .src_reg = 0, \ .off = 0, \ .imm = IMM }) /* Special form of mov32, used for doing explicit zero extension on dst. */ #define BPF_ZEXT_REG(DST) \ ((struct bpf_insn) { \ .code = BPF_ALU | BPF_MOV | BPF_X, \ .dst_reg = DST, \ .src_reg = DST, \ .off = 0, \ .imm = 1 }) static inline bool insn_is_zext(const struct bpf_insn *insn) { return insn->code == (BPF_ALU | BPF_MOV | BPF_X) && insn->imm == 1; } /* BPF_LD_IMM64 macro encodes single 'load 64-bit immediate' insn */ #define BPF_LD_IMM64(DST, IMM) \ BPF_LD_IMM64_RAW(DST, 0, IMM) #define BPF_LD_IMM64_RAW(DST, SRC, IMM) \ ((struct bpf_insn) { \ .code = BPF_LD | BPF_DW | BPF_IMM, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = 0, \ .imm = (__u32) (IMM) }), \ ((struct bpf_insn) { \ .code = 0, /* zero is reserved opcode */ \ .dst_reg = 0, \ .src_reg = 0, \ .off = 0, \ .imm = ((__u64) (IMM)) >> 32 }) /* pseudo BPF_LD_IMM64 insn used to refer to process-local map_fd */ #define BPF_LD_MAP_FD(DST, MAP_FD) \ BPF_LD_IMM64_RAW(DST, BPF_PSEUDO_MAP_FD, MAP_FD) /* Short form of mov based on type, BPF_X: dst_reg = src_reg, BPF_K: dst_reg = imm32 */ #define BPF_MOV64_RAW(TYPE, DST, SRC, IMM) \ ((struct bpf_insn) { \ .code = BPF_ALU64 | BPF_MOV | BPF_SRC(TYPE), \ .dst_reg = DST, \ .src_reg = SRC, \ .off = 0, \ .imm = IMM }) #define BPF_MOV32_RAW(TYPE, DST, SRC, IMM) \ ((struct bpf_insn) { \ .code = BPF_ALU | BPF_MOV | BPF_SRC(TYPE), \ .dst_reg = DST, \ .src_reg = SRC, \ .off = 0, \ .imm = IMM }) /* Direct packet access, R0 = *(uint *) (skb->data + imm32) */ #define BPF_LD_ABS(SIZE, IMM) \ ((struct bpf_insn) { \ .code = BPF_LD | BPF_SIZE(SIZE) | BPF_ABS, \ .dst_reg = 0, \ .src_reg = 0, \ .off = 0, \ .imm = IMM }) /* Indirect packet access, R0 = *(uint *) (skb->data + src_reg + imm32) */ #define BPF_LD_IND(SIZE, SRC, IMM) \ ((struct bpf_insn) { \ .code = BPF_LD | BPF_SIZE(SIZE) | BPF_IND, \ .dst_reg = 0, \ .src_reg = SRC, \ .off = 0, \ .imm = IMM }) /* Memory load, dst_reg = *(uint *) (src_reg + off16) */ #define BPF_LDX_MEM(SIZE, DST, SRC, OFF) \ ((struct bpf_insn) { \ .code = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = 0 }) /* Memory store, *(uint *) (dst_reg + off16) = src_reg */ #define BPF_STX_MEM(SIZE, DST, SRC, OFF) \ ((struct bpf_insn) { \ .code = BPF_STX | BPF_SIZE(SIZE) | BPF_MEM, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = 0 }) /* * Atomic operations: * * BPF_ADD *(uint *) (dst_reg + off16) += src_reg * BPF_AND *(uint *) (dst_reg + off16) &= src_reg * BPF_OR *(uint *) (dst_reg + off16) |= src_reg * BPF_XOR *(uint *) (dst_reg + off16) ^= src_reg * BPF_ADD | BPF_FETCH src_reg = atomic_fetch_add(dst_reg + off16, src_reg); * BPF_AND | BPF_FETCH src_reg = atomic_fetch_and(dst_reg + off16, src_reg); * BPF_OR | BPF_FETCH src_reg = atomic_fetch_or(dst_reg + off16, src_reg); * BPF_XOR | BPF_FETCH src_reg = atomic_fetch_xor(dst_reg + off16, src_reg); * BPF_XCHG src_reg = atomic_xchg(dst_reg + off16, src_reg) * BPF_CMPXCHG r0 = atomic_cmpxchg(dst_reg + off16, r0, src_reg) */ #define BPF_ATOMIC_OP(SIZE, OP, DST, SRC, OFF) \ ((struct bpf_insn) { \ .code = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = OP }) /* Legacy alias */ #define BPF_STX_XADD(SIZE, DST, SRC, OFF) BPF_ATOMIC_OP(SIZE, BPF_ADD, DST, SRC, OFF) /* Memory store, *(uint *) (dst_reg + off16) = imm32 */ #define BPF_ST_MEM(SIZE, DST, OFF, IMM) \ ((struct bpf_insn) { \ .code = BPF_ST | BPF_SIZE(SIZE) | BPF_MEM, \ .dst_reg = DST, \ .src_reg = 0, \ .off = OFF, \ .imm = IMM }) /* Conditional jumps against registers, if (dst_reg 'op' src_reg) goto pc + off16 */ #define BPF_JMP_REG(OP, DST, SRC, OFF) \ ((struct bpf_insn) { \ .code = BPF_JMP | BPF_OP(OP) | BPF_X, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = 0 }) /* Conditional jumps against immediates, if (dst_reg 'op' imm32) goto pc + off16 */ #define BPF_JMP_IMM(OP, DST, IMM, OFF) \ ((struct bpf_insn) { \ .code = BPF_JMP | BPF_OP(OP) | BPF_K, \ .dst_reg = DST, \ .src_reg = 0, \ .off = OFF, \ .imm = IMM }) /* Like BPF_JMP_REG, but with 32-bit wide operands for comparison. */ #define BPF_JMP32_REG(OP, DST, SRC, OFF) \ ((struct bpf_insn) { \ .code = BPF_JMP32 | BPF_OP(OP) | BPF_X, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = 0 }) /* Like BPF_JMP_IMM, but with 32-bit wide operands for comparison. */ #define BPF_JMP32_IMM(OP, DST, IMM, OFF) \ ((struct bpf_insn) { \ .code = BPF_JMP32 | BPF_OP(OP) | BPF_K, \ .dst_reg = DST, \ .src_reg = 0, \ .off = OFF, \ .imm = IMM }) /* Unconditional jumps, goto pc + off16 */ #define BPF_JMP_A(OFF) \ ((struct bpf_insn) { \ .code = BPF_JMP | BPF_JA, \ .dst_reg = 0, \ .src_reg = 0, \ .off = OFF, \ .imm = 0 }) /* Relative call */ #define BPF_CALL_REL(TGT) \ ((struct bpf_insn) { \ .code = BPF_JMP | BPF_CALL, \ .dst_reg = 0, \ .src_reg = BPF_PSEUDO_CALL, \ .off = 0, \ .imm = TGT }) /* Convert function address to BPF immediate */ #define BPF_CALL_IMM(x) ((void *)(x) - (void *)__bpf_call_base) #define BPF_EMIT_CALL(FUNC) \ ((struct bpf_insn) { \ .code = BPF_JMP | BPF_CALL, \ .dst_reg = 0, \ .src_reg = 0, \ .off = 0, \ .imm = BPF_CALL_IMM(FUNC) }) /* Raw code statement block */ #define BPF_RAW_INSN(CODE, DST, SRC, OFF, IMM) \ ((struct bpf_insn) { \ .code = CODE, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = IMM }) /* Program exit */ #define BPF_EXIT_INSN() \ ((struct bpf_insn) { \ .code = BPF_JMP | BPF_EXIT, \ .dst_reg = 0, \ .src_reg = 0, \ .off = 0, \ .imm = 0 }) /* Speculation barrier */ #define BPF_ST_NOSPEC() \ ((struct bpf_insn) { \ .code = BPF_ST | BPF_NOSPEC, \ .dst_reg = 0, \ .src_reg = 0, \ .off = 0, \ .imm = 0 }) /* Internal classic blocks for direct assignment */ #define __BPF_STMT(CODE, K) \ ((struct sock_filter) BPF_STMT(CODE, K)) #define __BPF_JUMP(CODE, K, JT, JF) \ ((struct sock_filter) BPF_JUMP(CODE, K, JT, JF)) #define bytes_to_bpf_size(bytes) \ ({ \ int bpf_size = -EINVAL; \ \ if (bytes == sizeof(u8)) \ bpf_size = BPF_B; \ else if (bytes == sizeof(u16)) \ bpf_size = BPF_H; \ else if (bytes == sizeof(u32)) \ bpf_size = BPF_W; \ else if (bytes == sizeof(u64)) \ bpf_size = BPF_DW; \ \ bpf_size; \ }) #define bpf_size_to_bytes(bpf_size) \ ({ \ int bytes = -EINVAL; \ \ if (bpf_size == BPF_B) \ bytes = sizeof(u8); \ else if (bpf_size == BPF_H) \ bytes = sizeof(u16); \ else if (bpf_size == BPF_W) \ bytes = sizeof(u32); \ else if (bpf_size == BPF_DW) \ bytes = sizeof(u64); \ \ bytes; \ }) #define BPF_SIZEOF(type) \ ({ \ const int __size = bytes_to_bpf_size(sizeof(type)); \ BUILD_BUG_ON(__size < 0); \ __size; \ }) #define BPF_FIELD_SIZEOF(type, field) \ ({ \ const int __size = bytes_to_bpf_size(sizeof_field(type, field)); \ BUILD_BUG_ON(__size < 0); \ __size; \ }) #define BPF_LDST_BYTES(insn) \ ({ \ const int __size = bpf_size_to_bytes(BPF_SIZE((insn)->code)); \ WARN_ON(__size < 0); \ __size; \ }) #define __BPF_MAP_0(m, v, ...) v #define __BPF_MAP_1(m, v, t, a, ...) m(t, a) #define __BPF_MAP_2(m, v, t, a, ...) m(t, a), __BPF_MAP_1(m, v, __VA_ARGS__) #define __BPF_MAP_3(m, v, t, a, ...) m(t, a), __BPF_MAP_2(m, v, __VA_ARGS__) #define __BPF_MAP_4(m, v, t, a, ...) m(t, a), __BPF_MAP_3(m, v, __VA_ARGS__) #define __BPF_MAP_5(m, v, t, a, ...) m(t, a), __BPF_MAP_4(m, v, __VA_ARGS__) #define __BPF_REG_0(...) __BPF_PAD(5) #define __BPF_REG_1(...) __BPF_MAP(1, __VA_ARGS__), __BPF_PAD(4) #define __BPF_REG_2(...) __BPF_MAP(2, __VA_ARGS__), __BPF_PAD(3) #define __BPF_REG_3(...) __BPF_MAP(3, __VA_ARGS__), __BPF_PAD(2) #define __BPF_REG_4(...) __BPF_MAP(4, __VA_ARGS__), __BPF_PAD(1) #define __BPF_REG_5(...) __BPF_MAP(5, __VA_ARGS__) #define __BPF_MAP(n, ...) __BPF_MAP_##n(__VA_ARGS__) #define __BPF_REG(n, ...) __BPF_REG_##n(__VA_ARGS__) #define __BPF_CAST(t, a) \ (__force t) \ (__force \ typeof(__builtin_choose_expr(sizeof(t) == sizeof(unsigned long), \ (unsigned long)0, (t)0))) a #define __BPF_V void #define __BPF_N #define __BPF_DECL_ARGS(t, a) t a #define __BPF_DECL_REGS(t, a) u64 a #define __BPF_PAD(n) \ __BPF_MAP(n, __BPF_DECL_ARGS, __BPF_N, u64, __ur_1, u64, __ur_2, \ u64, __ur_3, u64, __ur_4, u64, __ur_5) #define BPF_CALL_x(x, attr, name, ...) \ static __always_inline \ u64 ____##name(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)); \ typedef u64 (*btf_##name)(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)); \ attr u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__)); \ attr u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__)) \ { \ return ((btf_##name)____##name)(__BPF_MAP(x,__BPF_CAST,__BPF_N,__VA_ARGS__));\ } \ static __always_inline \ u64 ____##name(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)) #define __NOATTR #define BPF_CALL_0(name, ...) BPF_CALL_x(0, __NOATTR, name, __VA_ARGS__) #define BPF_CALL_1(name, ...) BPF_CALL_x(1, __NOATTR, name, __VA_ARGS__) #define BPF_CALL_2(name, ...) BPF_CALL_x(2, __NOATTR, name, __VA_ARGS__) #define BPF_CALL_3(name, ...) BPF_CALL_x(3, __NOATTR, name, __VA_ARGS__) #define BPF_CALL_4(name, ...) BPF_CALL_x(4, __NOATTR, name, __VA_ARGS__) #define BPF_CALL_5(name, ...) BPF_CALL_x(5, __NOATTR, name, __VA_ARGS__) #define NOTRACE_BPF_CALL_1(name, ...) BPF_CALL_x(1, notrace, name, __VA_ARGS__) #define bpf_ctx_range(TYPE, MEMBER) \ offsetof(TYPE, MEMBER) ... offsetofend(TYPE, MEMBER) - 1 #define bpf_ctx_range_till(TYPE, MEMBER1, MEMBER2) \ offsetof(TYPE, MEMBER1) ... offsetofend(TYPE, MEMBER2) - 1 #if BITS_PER_LONG == 64 # define bpf_ctx_range_ptr(TYPE, MEMBER) \ offsetof(TYPE, MEMBER) ... offsetofend(TYPE, MEMBER) - 1 #else # define bpf_ctx_range_ptr(TYPE, MEMBER) \ offsetof(TYPE, MEMBER) ... offsetof(TYPE, MEMBER) + 8 - 1 #endif /* BITS_PER_LONG == 64 */ #define bpf_target_off(TYPE, MEMBER, SIZE, PTR_SIZE) \ ({ \ BUILD_BUG_ON(sizeof_field(TYPE, MEMBER) != (SIZE)); \ *(PTR_SIZE) = (SIZE); \ offsetof(TYPE, MEMBER); \ }) /* A struct sock_filter is architecture independent. */ struct compat_sock_fprog { u16 len; compat_uptr_t filter; /* struct sock_filter * */ }; struct sock_fprog_kern { u16 len; struct sock_filter *filter; }; /* Some arches need doubleword alignment for their instructions and/or data */ #define BPF_IMAGE_ALIGNMENT 8 struct bpf_binary_header { u32 size; u8 image[] __aligned(BPF_IMAGE_ALIGNMENT); }; struct bpf_prog_stats { u64_stats_t cnt; u64_stats_t nsecs; u64_stats_t misses; struct u64_stats_sync syncp; } __aligned(2 * sizeof(u64)); struct sk_filter { refcount_t refcnt; struct rcu_head rcu; struct bpf_prog *prog; }; DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key); extern struct mutex nf_conn_btf_access_lock; extern int (*nfct_btf_struct_access)(struct bpf_verifier_log *log, const struct btf *btf, const struct btf_type *t, int off, int size, enum bpf_access_type atype, u32 *next_btf_id, enum bpf_type_flag *flag); typedef unsigned int (*bpf_dispatcher_fn)(const void *ctx, const struct bpf_insn *insnsi, unsigned int (*bpf_func)(const void *, const struct bpf_insn *)); static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog, const void *ctx, bpf_dispatcher_fn dfunc) { u32 ret; cant_migrate(); if (static_branch_unlikely(&bpf_stats_enabled_key)) { struct bpf_prog_stats *stats; u64 start = sched_clock(); unsigned long flags; ret = dfunc(ctx, prog->insnsi, prog->bpf_func); stats = this_cpu_ptr(prog->stats); flags = u64_stats_update_begin_irqsave(&stats->syncp); u64_stats_inc(&stats->cnt); u64_stats_add(&stats->nsecs, sched_clock() - start); u64_stats_update_end_irqrestore(&stats->syncp, flags); } else { ret = dfunc(ctx, prog->insnsi, prog->bpf_func); } return ret; } static __always_inline u32 bpf_prog_run(const struct bpf_prog *prog, const void *ctx) { return __bpf_prog_run(prog, ctx, bpf_dispatcher_nop_func); } /* * Use in preemptible and therefore migratable context to make sure that * the execution of the BPF program runs on one CPU. * * This uses migrate_disable/enable() explicitly to document that the * invocation of a BPF program does not require reentrancy protection * against a BPF program which is invoked from a preempting task. */ static inline u32 bpf_prog_run_pin_on_cpu(const struct bpf_prog *prog, const void *ctx) { u32 ret; migrate_disable(); ret = bpf_prog_run(prog, ctx); migrate_enable(); return ret; } #define BPF_SKB_CB_LEN QDISC_CB_PRIV_LEN struct bpf_skb_data_end { struct qdisc_skb_cb qdisc_cb; void *data_meta; void *data_end; }; struct bpf_nh_params { u32 nh_family; union { u32 ipv4_nh; struct in6_addr ipv6_nh; }; }; struct bpf_redirect_info { u32 flags; u32 tgt_index; void *tgt_value; struct bpf_map *map; u32 map_id; enum bpf_map_type map_type; u32 kern_flags; struct bpf_nh_params nh; }; DECLARE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info); /* flags for bpf_redirect_info kern_flags */ #define BPF_RI_F_RF_NO_DIRECT BIT(0) /* no napi_direct on return_frame */ /* Compute the linear packet data range [data, data_end) which * will be accessed by various program types (cls_bpf, act_bpf, * lwt, ...). Subsystems allowing direct data access must (!) * ensure that cb[] area can be written to when BPF program is * invoked (otherwise cb[] save/restore is necessary). */ static inline void bpf_compute_data_pointers(struct sk_buff *skb) { struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb; BUILD_BUG_ON(sizeof(*cb) > sizeof_field(struct sk_buff, cb)); cb->data_meta = skb->data - skb_metadata_len(skb); cb->data_end = skb->data + skb_headlen(skb); } /* Similar to bpf_compute_data_pointers(), except that save orginal * data in cb->data and cb->meta_data for restore. */ static inline void bpf_compute_and_save_data_end( struct sk_buff *skb, void **saved_data_end) { struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb; *saved_data_end = cb->data_end; cb->data_end = skb->data + skb_headlen(skb); } /* Restore data saved by bpf_compute_data_pointers(). */ static inline void bpf_restore_data_end( struct sk_buff *skb, void *saved_data_end) { struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb; cb->data_end = saved_data_end; } static inline u8 *bpf_skb_cb(const struct sk_buff *skb) { /* eBPF programs may read/write skb->cb[] area to transfer meta * data between tail calls. Since this also needs to work with * tc, that scratch memory is mapped to qdisc_skb_cb's data area. * * In some socket filter cases, the cb unfortunately needs to be * saved/restored so that protocol specific skb->cb[] data won't * be lost. In any case, due to unpriviledged eBPF programs * attached to sockets, we need to clear the bpf_skb_cb() area * to not leak previous contents to user space. */ BUILD_BUG_ON(sizeof_field(struct __sk_buff, cb) != BPF_SKB_CB_LEN); BUILD_BUG_ON(sizeof_field(struct __sk_buff, cb) != sizeof_field(struct qdisc_skb_cb, data)); return qdisc_skb_cb(skb)->data; } /* Must be invoked with migration disabled */ static inline u32 __bpf_prog_run_save_cb(const struct bpf_prog *prog, const void *ctx) { const struct sk_buff *skb = ctx; u8 *cb_data = bpf_skb_cb(skb); u8 cb_saved[BPF_SKB_CB_LEN]; u32 res; if (unlikely(prog->cb_access)) { memcpy(cb_saved, cb_data, sizeof(cb_saved)); memset(cb_data, 0, sizeof(cb_saved)); } res = bpf_prog_run(prog, skb); if (unlikely(prog->cb_access)) memcpy(cb_data, cb_saved, sizeof(cb_saved)); return res; } static inline u32 bpf_prog_run_save_cb(const struct bpf_prog *prog, struct sk_buff *skb) { u32 res; migrate_disable(); res = __bpf_prog_run_save_cb(prog, skb); migrate_enable(); return res; } static inline u32 bpf_prog_run_clear_cb(const struct bpf_prog *prog, struct sk_buff *skb) { u8 *cb_data = bpf_skb_cb(skb); u32 res; if (unlikely(prog->cb_access)) memset(cb_data, 0, BPF_SKB_CB_LEN); res = bpf_prog_run_pin_on_cpu(prog, skb); return res; } DECLARE_BPF_DISPATCHER(xdp) DECLARE_STATIC_KEY_FALSE(bpf_master_redirect_enabled_key); u32 xdp_master_redirect(struct xdp_buff *xdp); static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog, struct xdp_buff *xdp) { /* Driver XDP hooks are invoked within a single NAPI poll cycle and thus * under local_bh_disable(), which provides the needed RCU protection * for accessing map entries. */ u32 act = __bpf_prog_run(prog, xdp, BPF_DISPATCHER_FUNC(xdp)); if (static_branch_unlikely(&bpf_master_redirect_enabled_key)) { if (act == XDP_TX && netif_is_bond_slave(xdp->rxq->dev)) act = xdp_master_redirect(xdp); } return act; } void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog); static inline u32 bpf_prog_insn_size(const struct bpf_prog *prog) { return prog->len * sizeof(struct bpf_insn); } static inline u32 bpf_prog_tag_scratch_size(const struct bpf_prog *prog) { return round_up(bpf_prog_insn_size(prog) + sizeof(__be64) + 1, SHA1_BLOCK_SIZE); } static inline unsigned int bpf_prog_size(unsigned int proglen) { return max(sizeof(struct bpf_prog), offsetof(struct bpf_prog, insns[proglen])); } static inline bool bpf_prog_was_classic(const struct bpf_prog *prog) { /* When classic BPF programs have been loaded and the arch * does not have a classic BPF JIT (anymore), they have been * converted via bpf_migrate_filter() to eBPF and thus always * have an unspec program type. */ return prog->type == BPF_PROG_TYPE_UNSPEC; } static inline u32 bpf_ctx_off_adjust_machine(u32 size) { const u32 size_machine = sizeof(unsigned long); if (size > size_machine && size % size_machine == 0) size = size_machine; return size; } static inline bool bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default) { return size <= size_default && (size & (size - 1)) == 0; } static inline u8 bpf_ctx_narrow_access_offset(u32 off, u32 size, u32 size_default) { u8 access_off = off & (size_default - 1); #ifdef __LITTLE_ENDIAN return access_off; #else return size_default - (access_off + size); #endif } #define bpf_ctx_wide_access_ok(off, size, type, field) \ (size == sizeof(__u64) && \ off >= offsetof(type, field) && \ off + sizeof(__u64) <= offsetofend(type, field) && \ off % sizeof(__u64) == 0) #define bpf_classic_proglen(fprog) (fprog->len * sizeof(fprog->filter[0])) static inline int __must_check bpf_prog_lock_ro(struct bpf_prog *fp) { #ifndef CONFIG_BPF_JIT_ALWAYS_ON if (!fp->jited) { set_vm_flush_reset_perms(fp); return set_memory_ro((unsigned long)fp, fp->pages); } #endif return 0; } static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr) { set_vm_flush_reset_perms(hdr); set_memory_ro((unsigned long)hdr, hdr->size >> PAGE_SHIFT); set_memory_x((unsigned long)hdr, hdr->size >> PAGE_SHIFT); } int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap); static inline int sk_filter(struct sock *sk, struct sk_buff *skb) { return sk_filter_trim_cap(sk, skb, 1); } struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err); void bpf_prog_free(struct bpf_prog *fp); bool bpf_opcode_in_insntable(u8 code); void bpf_prog_free_linfo(struct bpf_prog *prog); void bpf_prog_fill_jited_linfo(struct bpf_prog *prog, const u32 *insn_to_jit_off); int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog); void bpf_prog_jit_attempt_done(struct bpf_prog *prog); struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags); struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags); struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, gfp_t gfp_extra_flags); void __bpf_prog_free(struct bpf_prog *fp); static inline void bpf_prog_unlock_free(struct bpf_prog *fp) { __bpf_prog_free(fp); } typedef int (*bpf_aux_classic_check_t)(struct sock_filter *filter, unsigned int flen); int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog); int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog, bpf_aux_classic_check_t trans, bool save_orig); void bpf_prog_destroy(struct bpf_prog *fp); int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk); int sk_attach_bpf(u32 ufd, struct sock *sk); int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk); int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk); void sk_reuseport_prog_free(struct bpf_prog *prog); int sk_detach_filter(struct sock *sk); int sk_get_filter(struct sock *sk, sockptr_t optval, unsigned int len); bool sk_filter_charge(struct sock *sk, struct sk_filter *fp); void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp); u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); #define __bpf_call_base_args \ ((u64 (*)(u64, u64, u64, u64, u64, const struct bpf_insn *)) \ (void *)__bpf_call_base) struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog); void bpf_jit_compile(struct bpf_prog *prog); bool bpf_jit_needs_zext(void); bool bpf_jit_supports_subprog_tailcalls(void); bool bpf_jit_supports_kfunc_call(void); bool bpf_helper_changes_pkt_data(void *func); static inline bool bpf_dump_raw_ok(const struct cred *cred) { /* Reconstruction of call-sites is dependent on kallsyms, * thus make dump the same restriction. */ return kallsyms_show_value(cred); } struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len); int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt); void bpf_clear_redirect_map(struct bpf_map *map); static inline bool xdp_return_frame_no_direct(void) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); return ri->kern_flags & BPF_RI_F_RF_NO_DIRECT; } static inline void xdp_set_return_frame_no_direct(void) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); ri->kern_flags |= BPF_RI_F_RF_NO_DIRECT; } static inline void xdp_clear_return_frame_no_direct(void) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); ri->kern_flags &= ~BPF_RI_F_RF_NO_DIRECT; } static inline int xdp_ok_fwd_dev(const struct net_device *fwd, unsigned int pktlen) { unsigned int len; if (unlikely(!(fwd->flags & IFF_UP))) return -ENETDOWN; len = fwd->mtu + fwd->hard_header_len + VLAN_HLEN; if (pktlen > len) return -EMSGSIZE; return 0; } /* The pair of xdp_do_redirect and xdp_do_flush MUST be called in the * same cpu context. Further for best results no more than a single map * for the do_redirect/do_flush pair should be used. This limitation is * because we only track one map and force a flush when the map changes. * This does not appear to be a real limitation for existing software. */ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, struct xdp_buff *xdp, struct bpf_prog *prog); int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *prog); int xdp_do_redirect_frame(struct net_device *dev, struct xdp_buff *xdp, struct xdp_frame *xdpf, struct bpf_prog *prog); void xdp_do_flush(void); /* The xdp_do_flush_map() helper has been renamed to drop the _map suffix, as * it is no longer only flushing maps. Keep this define for compatibility * until all drivers are updated - do not use xdp_do_flush_map() in new code! */ #define xdp_do_flush_map xdp_do_flush void bpf_warn_invalid_xdp_action(struct net_device *dev, struct bpf_prog *prog, u32 act); #ifdef CONFIG_INET struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, struct bpf_prog *prog, struct sk_buff *skb, struct sock *migrating_sk, u32 hash); #else static inline struct sock * bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, struct bpf_prog *prog, struct sk_buff *skb, struct sock *migrating_sk, u32 hash) { return NULL; } #endif #ifdef CONFIG_BPF_JIT extern int bpf_jit_enable; extern int bpf_jit_harden; extern int bpf_jit_kallsyms; extern long bpf_jit_limit; extern long bpf_jit_limit_max; typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size); void bpf_jit_fill_hole_with_zero(void *area, unsigned int size); struct bpf_binary_header * bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, unsigned int alignment, bpf_jit_fill_hole_t bpf_fill_ill_insns); void bpf_jit_binary_free(struct bpf_binary_header *hdr); u64 bpf_jit_alloc_exec_limit(void); void *bpf_jit_alloc_exec(unsigned long size); void bpf_jit_free_exec(void *addr); void bpf_jit_free(struct bpf_prog *fp); struct bpf_binary_header * bpf_jit_binary_pack_hdr(const struct bpf_prog *fp); void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns); void bpf_prog_pack_free(struct bpf_binary_header *hdr); static inline bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp) { return list_empty(&fp->aux->ksym.lnode) || fp->aux->ksym.lnode.prev == LIST_POISON2; } struct bpf_binary_header * bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **ro_image, unsigned int alignment, struct bpf_binary_header **rw_hdr, u8 **rw_image, bpf_jit_fill_hole_t bpf_fill_ill_insns); int bpf_jit_binary_pack_finalize(struct bpf_prog *prog, struct bpf_binary_header *ro_header, struct bpf_binary_header *rw_header); void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header, struct bpf_binary_header *rw_header); int bpf_jit_add_poke_descriptor(struct bpf_prog *prog, struct bpf_jit_poke_descriptor *poke); int bpf_jit_get_func_addr(const struct bpf_prog *prog, const struct bpf_insn *insn, bool extra_pass, u64 *func_addr, bool *func_addr_fixed); struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *fp); void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other); static inline void bpf_jit_dump(unsigned int flen, unsigned int proglen, u32 pass, void *image) { pr_err("flen=%u proglen=%u pass=%u image=%pK from=%s pid=%d\n", flen, proglen, pass, image, current->comm, task_pid_nr(current)); if (image) print_hex_dump(KERN_ERR, "JIT code: ", DUMP_PREFIX_OFFSET, 16, 1, image, proglen, false); } static inline bool bpf_jit_is_ebpf(void) { # ifdef CONFIG_HAVE_EBPF_JIT return true; # else return false; # endif } static inline bool ebpf_jit_enabled(void) { return bpf_jit_enable && bpf_jit_is_ebpf(); } static inline bool bpf_prog_ebpf_jited(const struct bpf_prog *fp) { return fp->jited && bpf_jit_is_ebpf(); } static inline bool bpf_jit_blinding_enabled(struct bpf_prog *prog) { /* These are the prerequisites, should someone ever have the * idea to call blinding outside of them, we make sure to * bail out. */ if (!bpf_jit_is_ebpf()) return false; if (!prog->jit_requested) return false; if (!bpf_jit_harden) return false; if (bpf_jit_harden == 1 && bpf_capable()) return false; return true; } static inline bool bpf_jit_kallsyms_enabled(void) { /* There are a couple of corner cases where kallsyms should * not be enabled f.e. on hardening. */ if (bpf_jit_harden) return false; if (!bpf_jit_kallsyms) return false; if (bpf_jit_kallsyms == 1) return true; return false; } const char *__bpf_address_lookup(unsigned long addr, unsigned long *size, unsigned long *off, char *sym); bool is_bpf_text_address(unsigned long addr); int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, char *sym); static inline const char * bpf_address_lookup(unsigned long addr, unsigned long *size, unsigned long *off, char **modname, char *sym) { const char *ret = __bpf_address_lookup(addr, size, off, sym); if (ret && modname) *modname = NULL; return ret; } void bpf_prog_kallsyms_add(struct bpf_prog *fp); void bpf_prog_kallsyms_del(struct bpf_prog *fp); #else /* CONFIG_BPF_JIT */ static inline bool ebpf_jit_enabled(void) { return false; } static inline bool bpf_jit_blinding_enabled(struct bpf_prog *prog) { return false; } static inline bool bpf_prog_ebpf_jited(const struct bpf_prog *fp) { return false; } static inline int bpf_jit_add_poke_descriptor(struct bpf_prog *prog, struct bpf_jit_poke_descriptor *poke) { return -ENOTSUPP; } static inline void bpf_jit_free(struct bpf_prog *fp) { bpf_prog_unlock_free(fp); } static inline bool bpf_jit_kallsyms_enabled(void) { return false; } static inline const char * __bpf_address_lookup(unsigned long addr, unsigned long *size, unsigned long *off, char *sym) { return NULL; } static inline bool is_bpf_text_address(unsigned long addr) { return false; } static inline int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, char *sym) { return -ERANGE; } static inline const char * bpf_address_lookup(unsigned long addr, unsigned long *size, unsigned long *off, char **modname, char *sym) { return NULL; } static inline void bpf_prog_kallsyms_add(struct bpf_prog *fp) { } static inline void bpf_prog_kallsyms_del(struct bpf_prog *fp) { } #endif /* CONFIG_BPF_JIT */ void bpf_prog_kallsyms_del_all(struct bpf_prog *fp); #define BPF_ANC BIT(15) static inline bool bpf_needs_clear_a(const struct sock_filter *first) { switch (first->code) { case BPF_RET | BPF_K: case BPF_LD | BPF_W | BPF_LEN: return false; case BPF_LD | BPF_W | BPF_ABS: case BPF_LD | BPF_H | BPF_ABS: case BPF_LD | BPF_B | BPF_ABS: if (first->k == SKF_AD_OFF + SKF_AD_ALU_XOR_X) return true; return false; default: return true; } } static inline u16 bpf_anc_helper(const struct sock_filter *ftest) { BUG_ON(ftest->code & BPF_ANC); switch (ftest->code) { case BPF_LD | BPF_W | BPF_ABS: case BPF_LD | BPF_H | BPF_ABS: case BPF_LD | BPF_B | BPF_ABS: #define BPF_ANCILLARY(CODE) case SKF_AD_OFF + SKF_AD_##CODE: \ return BPF_ANC | SKF_AD_##CODE switch (ftest->k) { BPF_ANCILLARY(PROTOCOL); BPF_ANCILLARY(PKTTYPE); BPF_ANCILLARY(IFINDEX); BPF_ANCILLARY(NLATTR); BPF_ANCILLARY(NLATTR_NEST); BPF_ANCILLARY(MARK); BPF_ANCILLARY(QUEUE); BPF_ANCILLARY(HATYPE); BPF_ANCILLARY(RXHASH); BPF_ANCILLARY(CPU); BPF_ANCILLARY(ALU_XOR_X); BPF_ANCILLARY(VLAN_TAG); BPF_ANCILLARY(VLAN_TAG_PRESENT); BPF_ANCILLARY(PAY_OFFSET); BPF_ANCILLARY(RANDOM); BPF_ANCILLARY(VLAN_TPID); } fallthrough; default: return ftest->code; } } void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, unsigned int size); static inline int bpf_tell_extensions(void) { return SKF_AD_MAX; } struct bpf_sock_addr_kern { struct sock *sk; struct sockaddr *uaddr; /* Temporary "register" to make indirect stores to nested structures * defined above. We need three registers to make such a store, but * only two (src and dst) are available at convert_ctx_access time */ u64 tmp_reg; void *t_ctx; /* Attach type specific context. */ }; struct bpf_sock_ops_kern { struct sock *sk; union { u32 args[4]; u32 reply; u32 replylong[4]; }; struct sk_buff *syn_skb; struct sk_buff *skb; void *skb_data_end; u8 op; u8 is_fullsock; u8 remaining_opt_len; u64 temp; /* temp and everything after is not * initialized to 0 before calling * the BPF program. New fields that * should be initialized to 0 should * be inserted before temp. * temp is scratch storage used by * sock_ops_convert_ctx_access * as temporary storage of a register. */ }; struct bpf_sysctl_kern { struct ctl_table_header *head; struct ctl_table *table; void *cur_val; size_t cur_len; void *new_val; size_t new_len; int new_updated; int write; loff_t *ppos; /* Temporary "register" for indirect stores to ppos. */ u64 tmp_reg; }; #define BPF_SOCKOPT_KERN_BUF_SIZE 32 struct bpf_sockopt_buf { u8 data[BPF_SOCKOPT_KERN_BUF_SIZE]; }; struct bpf_sockopt_kern { struct sock *sk; u8 *optval; u8 *optval_end; s32 level; s32 optname; s32 optlen; /* for retval in struct bpf_cg_run_ctx */ struct task_struct *current_task; /* Temporary "register" for indirect stores to ppos. */ u64 tmp_reg; }; int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len); struct bpf_sk_lookup_kern { u16 family; u16 protocol; __be16 sport; u16 dport; struct { __be32 saddr; __be32 daddr; } v4; struct { const struct in6_addr *saddr; const struct in6_addr *daddr; } v6; struct sock *selected_sk; u32 ingress_ifindex; bool no_reuseport; }; extern struct static_key_false bpf_sk_lookup_enabled; /* Runners for BPF_SK_LOOKUP programs to invoke on socket lookup. * * Allowed return values for a BPF SK_LOOKUP program are SK_PASS and * SK_DROP. Their meaning is as follows: * * SK_PASS && ctx.selected_sk != NULL: use selected_sk as lookup result * SK_PASS && ctx.selected_sk == NULL: continue to htable-based socket lookup * SK_DROP : terminate lookup with -ECONNREFUSED * * This macro aggregates return values and selected sockets from * multiple BPF programs according to following rules in order: * * 1. If any program returned SK_PASS and a non-NULL ctx.selected_sk, * macro result is SK_PASS and last ctx.selected_sk is used. * 2. If any program returned SK_DROP return value, * macro result is SK_DROP. * 3. Otherwise result is SK_PASS and ctx.selected_sk is NULL. * * Caller must ensure that the prog array is non-NULL, and that the * array as well as the programs it contains remain valid. */ #define BPF_PROG_SK_LOOKUP_RUN_ARRAY(array, ctx, func) \ ({ \ struct bpf_sk_lookup_kern *_ctx = &(ctx); \ struct bpf_prog_array_item *_item; \ struct sock *_selected_sk = NULL; \ bool _no_reuseport = false; \ struct bpf_prog *_prog; \ bool _all_pass = true; \ u32 _ret; \ \ migrate_disable(); \ _item = &(array)->items[0]; \ while ((_prog = READ_ONCE(_item->prog))) { \ /* restore most recent selection */ \ _ctx->selected_sk = _selected_sk; \ _ctx->no_reuseport = _no_reuseport; \ \ _ret = func(_prog, _ctx); \ if (_ret == SK_PASS && _ctx->selected_sk) { \ /* remember last non-NULL socket */ \ _selected_sk = _ctx->selected_sk; \ _no_reuseport = _ctx->no_reuseport; \ } else if (_ret == SK_DROP && _all_pass) { \ _all_pass = false; \ } \ _item++; \ } \ _ctx->selected_sk = _selected_sk; \ _ctx->no_reuseport = _no_reuseport; \ migrate_enable(); \ _all_pass || _selected_sk ? SK_PASS : SK_DROP; \ }) static inline bool bpf_sk_lookup_run_v4(struct net *net, int protocol, const __be32 saddr, const __be16 sport, const __be32 daddr, const u16 dport, const int ifindex, struct sock **psk) { struct bpf_prog_array *run_array; struct sock *selected_sk = NULL; bool no_reuseport = false; rcu_read_lock(); run_array = rcu_dereference(net->bpf.run_array[NETNS_BPF_SK_LOOKUP]); if (run_array) { struct bpf_sk_lookup_kern ctx = { .family = AF_INET, .protocol = protocol, .v4.saddr = saddr, .v4.daddr = daddr, .sport = sport, .dport = dport, .ingress_ifindex = ifindex, }; u32 act; act = BPF_PROG_SK_LOOKUP_RUN_ARRAY(run_array, ctx, bpf_prog_run); if (act == SK_PASS) { selected_sk = ctx.selected_sk; no_reuseport = ctx.no_reuseport; } else { selected_sk = ERR_PTR(-ECONNREFUSED); } } rcu_read_unlock(); *psk = selected_sk; return no_reuseport; } #if IS_ENABLED(CONFIG_IPV6) static inline bool bpf_sk_lookup_run_v6(struct net *net, int protocol, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const u16 dport, const int ifindex, struct sock **psk) { struct bpf_prog_array *run_array; struct sock *selected_sk = NULL; bool no_reuseport = false; rcu_read_lock(); run_array = rcu_dereference(net->bpf.run_array[NETNS_BPF_SK_LOOKUP]); if (run_array) { struct bpf_sk_lookup_kern ctx = { .family = AF_INET6, .protocol = protocol, .v6.saddr = saddr, .v6.daddr = daddr, .sport = sport, .dport = dport, .ingress_ifindex = ifindex, }; u32 act; act = BPF_PROG_SK_LOOKUP_RUN_ARRAY(run_array, ctx, bpf_prog_run); if (act == SK_PASS) { selected_sk = ctx.selected_sk; no_reuseport = ctx.no_reuseport; } else { selected_sk = ERR_PTR(-ECONNREFUSED); } } rcu_read_unlock(); *psk = selected_sk; return no_reuseport; } #endif /* IS_ENABLED(CONFIG_IPV6) */ static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifindex, u64 flags, const u64 flag_mask, void *lookup_elem(struct bpf_map *map, u32 key)) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); const u64 action_mask = XDP_ABORTED | XDP_DROP | XDP_PASS | XDP_TX; /* Lower bits of the flags are used as return code on lookup failure */ if (unlikely(flags & ~(action_mask | flag_mask))) return XDP_ABORTED; ri->tgt_value = lookup_elem(map, ifindex); if (unlikely(!ri->tgt_value) && !(flags & BPF_F_BROADCAST)) { /* If the lookup fails we want to clear out the state in the * redirect_info struct completely, so that if an eBPF program * performs multiple lookups, the last one always takes * precedence. */ ri->map_id = INT_MAX; /* Valid map id idr range: [1,INT_MAX[ */ ri->map_type = BPF_MAP_TYPE_UNSPEC; return flags & action_mask; } ri->tgt_index = ifindex; ri->map_id = map->id; ri->map_type = map->map_type; if (flags & BPF_F_BROADCAST) { WRITE_ONCE(ri->map, map); ri->flags = flags; } else { WRITE_ONCE(ri->map, NULL); ri->flags = 0; } return XDP_REDIRECT; } #endif /* __LINUX_FILTER_H__ */ |
613 613 374 240 605 8 8 605 1027 342 613 277 98 142 101 198 418 528 2 9 1 6 2 8 535 530 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 | // SPDX-License-Identifier: GPL-2.0-only /* * * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com> * Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov> */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/stat.h> #include <linux/string.h> #include <linux/inet.h> #include <linux/pagemap.h> #include <linux/mount.h> #include <linux/idr.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/statfs.h> #include <linux/magic.h> #include <linux/fscache.h> #include <net/9p/9p.h> #include <net/9p/client.h> #include "v9fs.h" #include "v9fs_vfs.h" #include "fid.h" #include "xattr.h" #include "acl.h" static const struct super_operations v9fs_super_ops, v9fs_super_ops_dotl; /** * v9fs_set_super - set the superblock * @s: super block * @data: file system specific data * */ static int v9fs_set_super(struct super_block *s, void *data) { s->s_fs_info = data; return set_anon_super(s, data); } /** * v9fs_fill_super - populate superblock with info * @sb: superblock * @v9ses: session information * @flags: flags propagated from v9fs_mount() * */ static int v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, int flags) { int ret; sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_blocksize_bits = fls(v9ses->maxdata - 1); sb->s_blocksize = 1 << sb->s_blocksize_bits; sb->s_magic = V9FS_MAGIC; if (v9fs_proto_dotl(v9ses)) { sb->s_op = &v9fs_super_ops_dotl; sb->s_xattr = v9fs_xattr_handlers; } else { sb->s_op = &v9fs_super_ops; sb->s_time_max = U32_MAX; } sb->s_time_min = 0; ret = super_setup_bdi(sb); if (ret) return ret; if (!v9ses->cache) { sb->s_bdi->ra_pages = 0; sb->s_bdi->io_pages = 0; } else { sb->s_bdi->ra_pages = v9ses->maxdata >> PAGE_SHIFT; sb->s_bdi->io_pages = v9ses->maxdata >> PAGE_SHIFT; } sb->s_flags |= SB_ACTIVE | SB_DIRSYNC; if (!v9ses->cache) sb->s_flags |= SB_SYNCHRONOUS; #ifdef CONFIG_9P_FS_POSIX_ACL if ((v9ses->flags & V9FS_ACL_MASK) == V9FS_POSIX_ACL) sb->s_flags |= SB_POSIXACL; #endif return 0; } /** * v9fs_mount - mount a superblock * @fs_type: file system type * @flags: mount flags * @dev_name: device name that was mounted * @data: mount options * */ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { struct super_block *sb = NULL; struct inode *inode = NULL; struct dentry *root = NULL; struct v9fs_session_info *v9ses = NULL; umode_t mode = 0777 | S_ISVTX; struct p9_fid *fid; int retval = 0; p9_debug(P9_DEBUG_VFS, "\n"); v9ses = kzalloc(sizeof(struct v9fs_session_info), GFP_KERNEL); if (!v9ses) return ERR_PTR(-ENOMEM); fid = v9fs_session_init(v9ses, dev_name, data); if (IS_ERR(fid)) { retval = PTR_ERR(fid); goto free_session; } sb = sget(fs_type, NULL, v9fs_set_super, flags, v9ses); if (IS_ERR(sb)) { retval = PTR_ERR(sb); goto clunk_fid; } retval = v9fs_fill_super(sb, v9ses, flags); if (retval) goto release_sb; if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) sb->s_d_op = &v9fs_cached_dentry_operations; else sb->s_d_op = &v9fs_dentry_operations; inode = v9fs_get_inode(sb, S_IFDIR | mode, 0); if (IS_ERR(inode)) { retval = PTR_ERR(inode); goto release_sb; } root = d_make_root(inode); if (!root) { retval = -ENOMEM; goto release_sb; } sb->s_root = root; if (v9fs_proto_dotl(v9ses)) { struct p9_stat_dotl *st = NULL; st = p9_client_getattr_dotl(fid, P9_STATS_BASIC); if (IS_ERR(st)) { retval = PTR_ERR(st); goto release_sb; } d_inode(root)->i_ino = v9fs_qid2ino(&st->qid); v9fs_stat2inode_dotl(st, d_inode(root), 0); kfree(st); } else { struct p9_wstat *st = NULL; st = p9_client_stat(fid); if (IS_ERR(st)) { retval = PTR_ERR(st); goto release_sb; } d_inode(root)->i_ino = v9fs_qid2ino(&st->qid); v9fs_stat2inode(st, d_inode(root), sb, 0); p9stat_free(st); kfree(st); } retval = v9fs_get_acl(inode, fid); if (retval) goto release_sb; v9fs_fid_add(root, &fid); p9_debug(P9_DEBUG_VFS, " simple set mount, return 0\n"); return dget(sb->s_root); clunk_fid: p9_fid_put(fid); v9fs_session_close(v9ses); free_session: kfree(v9ses); return ERR_PTR(retval); release_sb: /* * we will do the session_close and root dentry release * in the below call. But we need to clunk fid, because we haven't * attached the fid to dentry so it won't get clunked * automatically. */ p9_fid_put(fid); deactivate_locked_super(sb); return ERR_PTR(retval); } /** * v9fs_kill_super - Kill Superblock * @s: superblock * */ static void v9fs_kill_super(struct super_block *s) { struct v9fs_session_info *v9ses = s->s_fs_info; p9_debug(P9_DEBUG_VFS, " %p\n", s); kill_anon_super(s); v9fs_session_cancel(v9ses); v9fs_session_close(v9ses); kfree(v9ses); s->s_fs_info = NULL; p9_debug(P9_DEBUG_VFS, "exiting kill_super\n"); } static void v9fs_umount_begin(struct super_block *sb) { struct v9fs_session_info *v9ses; v9ses = sb->s_fs_info; v9fs_session_begin_cancel(v9ses); } static int v9fs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct v9fs_session_info *v9ses; struct p9_fid *fid; struct p9_rstatfs rs; int res; fid = v9fs_fid_lookup(dentry); if (IS_ERR(fid)) { res = PTR_ERR(fid); goto done; } v9ses = v9fs_dentry2v9ses(dentry); if (v9fs_proto_dotl(v9ses)) { res = p9_client_statfs(fid, &rs); if (res == 0) { buf->f_type = rs.type; buf->f_bsize = rs.bsize; buf->f_blocks = rs.blocks; buf->f_bfree = rs.bfree; buf->f_bavail = rs.bavail; buf->f_files = rs.files; buf->f_ffree = rs.ffree; buf->f_fsid = u64_to_fsid(rs.fsid); buf->f_namelen = rs.namelen; } if (res != -ENOSYS) goto done; } res = simple_statfs(dentry, buf); done: p9_fid_put(fid); return res; } static int v9fs_drop_inode(struct inode *inode) { struct v9fs_session_info *v9ses; v9ses = v9fs_inode2v9ses(inode); if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) return generic_drop_inode(inode); /* * in case of non cached mode always drop the * inode because we want the inode attribute * to always match that on the server. */ return 1; } static int v9fs_write_inode(struct inode *inode, struct writeback_control *wbc) { int ret; struct p9_wstat wstat; struct v9fs_inode *v9inode; /* * send an fsync request to server irrespective of * wbc->sync_mode. */ p9_debug(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode); v9inode = V9FS_I(inode); if (!v9inode->writeback_fid) return 0; v9fs_blank_wstat(&wstat); ret = p9_client_wstat(v9inode->writeback_fid, &wstat); if (ret < 0) { __mark_inode_dirty(inode, I_DIRTY_DATASYNC); return ret; } fscache_unpin_writeback(wbc, v9fs_inode_cookie(v9inode)); return 0; } static int v9fs_write_inode_dotl(struct inode *inode, struct writeback_control *wbc) { int ret; struct v9fs_inode *v9inode; /* * send an fsync request to server irrespective of * wbc->sync_mode. */ v9inode = V9FS_I(inode); p9_debug(P9_DEBUG_VFS, "%s: inode %p, writeback_fid %p\n", __func__, inode, v9inode->writeback_fid); if (!v9inode->writeback_fid) return 0; ret = p9_client_fsync(v9inode->writeback_fid, 0); if (ret < 0) { __mark_inode_dirty(inode, I_DIRTY_DATASYNC); return ret; } fscache_unpin_writeback(wbc, v9fs_inode_cookie(v9inode)); return 0; } static const struct super_operations v9fs_super_ops = { .alloc_inode = v9fs_alloc_inode, .free_inode = v9fs_free_inode, .statfs = simple_statfs, .drop_inode = v9fs_drop_inode, .evict_inode = v9fs_evict_inode, .show_options = v9fs_show_options, .umount_begin = v9fs_umount_begin, .write_inode = v9fs_write_inode, }; static const struct super_operations v9fs_super_ops_dotl = { .alloc_inode = v9fs_alloc_inode, .free_inode = v9fs_free_inode, .statfs = v9fs_statfs, .drop_inode = v9fs_drop_inode, .evict_inode = v9fs_evict_inode, .show_options = v9fs_show_options, .umount_begin = v9fs_umount_begin, .write_inode = v9fs_write_inode_dotl, }; struct file_system_type v9fs_fs_type = { .name = "9p", .mount = v9fs_mount, .kill_sb = v9fs_kill_super, .owner = THIS_MODULE, .fs_flags = FS_RENAME_DOES_D_MOVE, }; MODULE_ALIAS_FS("9p"); |
1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 | // SPDX-License-Identifier: GPL-2.0 /* * USB Serial Converter Generic functions * * Copyright (C) 2010 - 2013 Johan Hovold (jhovold@gmail.com) * Copyright (C) 1999 - 2002 Greg Kroah-Hartman (greg@kroah.com) */ #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/sysrq.h> #include <linux/tty.h> #include <linux/tty_flip.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/usb.h> #include <linux/usb/serial.h> #include <linux/uaccess.h> #include <linux/kfifo.h> #include <linux/serial.h> #ifdef CONFIG_USB_SERIAL_GENERIC static __u16 vendor = 0x05f9; static __u16 product = 0xffff; module_param(vendor, ushort, 0); MODULE_PARM_DESC(vendor, "User specified USB idVendor"); module_param(product, ushort, 0); MODULE_PARM_DESC(product, "User specified USB idProduct"); static struct usb_device_id generic_device_ids[2]; /* Initially all zeroes. */ static int usb_serial_generic_probe(struct usb_serial *serial, const struct usb_device_id *id) { struct device *dev = &serial->interface->dev; dev_info(dev, "The \"generic\" usb-serial driver is only for testing and one-off prototypes.\n"); dev_info(dev, "Tell linux-usb@vger.kernel.org to add your device to a proper driver.\n"); return 0; } static int usb_serial_generic_calc_num_ports(struct usb_serial *serial, struct usb_serial_endpoints *epds) { struct device *dev = &serial->interface->dev; int num_ports; num_ports = max(epds->num_bulk_in, epds->num_bulk_out); if (num_ports == 0) { dev_err(dev, "device has no bulk endpoints\n"); return -ENODEV; } return num_ports; } static struct usb_serial_driver usb_serial_generic_device = { .driver = { .owner = THIS_MODULE, .name = "generic", }, .id_table = generic_device_ids, .probe = usb_serial_generic_probe, .calc_num_ports = usb_serial_generic_calc_num_ports, .throttle = usb_serial_generic_throttle, .unthrottle = usb_serial_generic_unthrottle, .resume = usb_serial_generic_resume, }; static struct usb_serial_driver * const serial_drivers[] = { &usb_serial_generic_device, NULL }; #endif int usb_serial_generic_register(void) { int retval = 0; #ifdef CONFIG_USB_SERIAL_GENERIC generic_device_ids[0].idVendor = vendor; generic_device_ids[0].idProduct = product; generic_device_ids[0].match_flags = USB_DEVICE_ID_MATCH_VENDOR | USB_DEVICE_ID_MATCH_PRODUCT; retval = usb_serial_register_drivers(serial_drivers, "usbserial_generic", generic_device_ids); #endif return retval; } void usb_serial_generic_deregister(void) { #ifdef CONFIG_USB_SERIAL_GENERIC usb_serial_deregister_drivers(serial_drivers); #endif } int usb_serial_generic_open(struct tty_struct *tty, struct usb_serial_port *port) { int result = 0; clear_bit(USB_SERIAL_THROTTLED, &port->flags); if (port->bulk_in_size) result = usb_serial_generic_submit_read_urbs(port, GFP_KERNEL); return result; } EXPORT_SYMBOL_GPL(usb_serial_generic_open); void usb_serial_generic_close(struct usb_serial_port *port) { unsigned long flags; int i; if (port->bulk_out_size) { for (i = 0; i < ARRAY_SIZE(port->write_urbs); ++i) usb_kill_urb(port->write_urbs[i]); spin_lock_irqsave(&port->lock, flags); kfifo_reset_out(&port->write_fifo); spin_unlock_irqrestore(&port->lock, flags); } if (port->bulk_in_size) { for (i = 0; i < ARRAY_SIZE(port->read_urbs); ++i) usb_kill_urb(port->read_urbs[i]); } } EXPORT_SYMBOL_GPL(usb_serial_generic_close); int usb_serial_generic_prepare_write_buffer(struct usb_serial_port *port, void *dest, size_t size) { return kfifo_out_locked(&port->write_fifo, dest, size, &port->lock); } /** * usb_serial_generic_write_start - start writing buffered data * @port: usb-serial port * @mem_flags: flags to use for memory allocations * * Serialised using USB_SERIAL_WRITE_BUSY flag. * * Return: Zero on success or if busy, otherwise a negative errno value. */ int usb_serial_generic_write_start(struct usb_serial_port *port, gfp_t mem_flags) { struct urb *urb; int count, result; unsigned long flags; int i; if (test_and_set_bit_lock(USB_SERIAL_WRITE_BUSY, &port->flags)) return 0; retry: spin_lock_irqsave(&port->lock, flags); if (!port->write_urbs_free || !kfifo_len(&port->write_fifo)) { clear_bit_unlock(USB_SERIAL_WRITE_BUSY, &port->flags); spin_unlock_irqrestore(&port->lock, flags); return 0; } i = (int)find_first_bit(&port->write_urbs_free, ARRAY_SIZE(port->write_urbs)); spin_unlock_irqrestore(&port->lock, flags); urb = port->write_urbs[i]; count = port->serial->type->prepare_write_buffer(port, urb->transfer_buffer, port->bulk_out_size); urb->transfer_buffer_length = count; usb_serial_debug_data(&port->dev, __func__, count, urb->transfer_buffer); spin_lock_irqsave(&port->lock, flags); port->tx_bytes += count; spin_unlock_irqrestore(&port->lock, flags); clear_bit(i, &port->write_urbs_free); result = usb_submit_urb(urb, mem_flags); if (result) { dev_err_console(port, "%s - error submitting urb: %d\n", __func__, result); set_bit(i, &port->write_urbs_free); spin_lock_irqsave(&port->lock, flags); port->tx_bytes -= count; spin_unlock_irqrestore(&port->lock, flags); clear_bit_unlock(USB_SERIAL_WRITE_BUSY, &port->flags); return result; } goto retry; /* try sending off another urb */ } EXPORT_SYMBOL_GPL(usb_serial_generic_write_start); /** * usb_serial_generic_write - generic write function * @tty: tty for the port * @port: usb-serial port * @buf: data to write * @count: number of bytes to write * * Return: The number of characters buffered, which may be anything from * zero to @count, or a negative errno value. */ int usb_serial_generic_write(struct tty_struct *tty, struct usb_serial_port *port, const unsigned char *buf, int count) { int result; if (!port->bulk_out_size) return -ENODEV; if (!count) return 0; count = kfifo_in_locked(&port->write_fifo, buf, count, &port->lock); result = usb_serial_generic_write_start(port, GFP_ATOMIC); if (result) return result; return count; } EXPORT_SYMBOL_GPL(usb_serial_generic_write); unsigned int usb_serial_generic_write_room(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; unsigned long flags; unsigned int room; if (!port->bulk_out_size) return 0; spin_lock_irqsave(&port->lock, flags); room = kfifo_avail(&port->write_fifo); spin_unlock_irqrestore(&port->lock, flags); dev_dbg(&port->dev, "%s - returns %u\n", __func__, room); return room; } unsigned int usb_serial_generic_chars_in_buffer(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; unsigned long flags; unsigned int chars; if (!port->bulk_out_size) return 0; spin_lock_irqsave(&port->lock, flags); chars = kfifo_len(&port->write_fifo) + port->tx_bytes; spin_unlock_irqrestore(&port->lock, flags); dev_dbg(&port->dev, "%s - returns %u\n", __func__, chars); return chars; } EXPORT_SYMBOL_GPL(usb_serial_generic_chars_in_buffer); void usb_serial_generic_wait_until_sent(struct tty_struct *tty, long timeout) { struct usb_serial_port *port = tty->driver_data; unsigned int bps; unsigned long period; unsigned long expire; bps = tty_get_baud_rate(tty); if (!bps) bps = 9600; /* B0 */ /* * Use a poll-period of roughly the time it takes to send one * character or at least one jiffy. */ period = max_t(unsigned long, (10 * HZ / bps), 1); if (timeout) period = min_t(unsigned long, period, timeout); dev_dbg(&port->dev, "%s - timeout = %u ms, period = %u ms\n", __func__, jiffies_to_msecs(timeout), jiffies_to_msecs(period)); expire = jiffies + timeout; while (!port->serial->type->tx_empty(port)) { schedule_timeout_interruptible(period); if (signal_pending(current)) break; if (timeout && time_after(jiffies, expire)) break; } } EXPORT_SYMBOL_GPL(usb_serial_generic_wait_until_sent); static int usb_serial_generic_submit_read_urb(struct usb_serial_port *port, int index, gfp_t mem_flags) { int res; if (!test_and_clear_bit(index, &port->read_urbs_free)) return 0; dev_dbg(&port->dev, "%s - urb %d\n", __func__, index); res = usb_submit_urb(port->read_urbs[index], mem_flags); if (res) { if (res != -EPERM && res != -ENODEV) { dev_err(&port->dev, "%s - usb_submit_urb failed: %d\n", __func__, res); } set_bit(index, &port->read_urbs_free); return res; } return 0; } int usb_serial_generic_submit_read_urbs(struct usb_serial_port *port, gfp_t mem_flags) { int res; int i; for (i = 0; i < ARRAY_SIZE(port->read_urbs); ++i) { res = usb_serial_generic_submit_read_urb(port, i, mem_flags); if (res) goto err; } return 0; err: for (; i >= 0; --i) usb_kill_urb(port->read_urbs[i]); return res; } EXPORT_SYMBOL_GPL(usb_serial_generic_submit_read_urbs); void usb_serial_generic_process_read_urb(struct urb *urb) { struct usb_serial_port *port = urb->context; char *ch = urb->transfer_buffer; int i; if (!urb->actual_length) return; /* * The per character mucking around with sysrq path it too slow for * stuff like 3G modems, so shortcircuit it in the 99.9999999% of * cases where the USB serial is not a console anyway. */ if (port->sysrq) { for (i = 0; i < urb->actual_length; i++, ch++) { if (!usb_serial_handle_sysrq_char(port, *ch)) tty_insert_flip_char(&port->port, *ch, TTY_NORMAL); } } else { tty_insert_flip_string(&port->port, ch, urb->actual_length); } tty_flip_buffer_push(&port->port); } EXPORT_SYMBOL_GPL(usb_serial_generic_process_read_urb); void usb_serial_generic_read_bulk_callback(struct urb *urb) { struct usb_serial_port *port = urb->context; unsigned char *data = urb->transfer_buffer; bool stopped = false; int status = urb->status; int i; for (i = 0; i < ARRAY_SIZE(port->read_urbs); ++i) { if (urb == port->read_urbs[i]) break; } dev_dbg(&port->dev, "%s - urb %d, len %d\n", __func__, i, urb->actual_length); switch (status) { case 0: usb_serial_debug_data(&port->dev, __func__, urb->actual_length, data); port->serial->type->process_read_urb(urb); break; case -ENOENT: case -ECONNRESET: case -ESHUTDOWN: dev_dbg(&port->dev, "%s - urb stopped: %d\n", __func__, status); stopped = true; break; case -EPIPE: dev_err(&port->dev, "%s - urb stopped: %d\n", __func__, status); stopped = true; break; default: dev_dbg(&port->dev, "%s - nonzero urb status: %d\n", __func__, status); break; } /* * Make sure URB processing is done before marking as free to avoid * racing with unthrottle() on another CPU. Matches the barriers * implied by the test_and_clear_bit() in * usb_serial_generic_submit_read_urb(). */ smp_mb__before_atomic(); set_bit(i, &port->read_urbs_free); /* * Make sure URB is marked as free before checking the throttled flag * to avoid racing with unthrottle() on another CPU. Matches the * smp_mb__after_atomic() in unthrottle(). */ smp_mb__after_atomic(); if (stopped) return; if (test_bit(USB_SERIAL_THROTTLED, &port->flags)) return; usb_serial_generic_submit_read_urb(port, i, GFP_ATOMIC); } EXPORT_SYMBOL_GPL(usb_serial_generic_read_bulk_callback); void usb_serial_generic_write_bulk_callback(struct urb *urb) { unsigned long flags; struct usb_serial_port *port = urb->context; int status = urb->status; int i; for (i = 0; i < ARRAY_SIZE(port->write_urbs); ++i) { if (port->write_urbs[i] == urb) break; } spin_lock_irqsave(&port->lock, flags); port->tx_bytes -= urb->transfer_buffer_length; set_bit(i, &port->write_urbs_free); spin_unlock_irqrestore(&port->lock, flags); switch (status) { case 0: break; case -ENOENT: case -ECONNRESET: case -ESHUTDOWN: dev_dbg(&port->dev, "%s - urb stopped: %d\n", __func__, status); return; case -EPIPE: dev_err_console(port, "%s - urb stopped: %d\n", __func__, status); return; default: dev_err_console(port, "%s - nonzero urb status: %d\n", __func__, status); break; } usb_serial_generic_write_start(port, GFP_ATOMIC); usb_serial_port_softint(port); } EXPORT_SYMBOL_GPL(usb_serial_generic_write_bulk_callback); void usb_serial_generic_throttle(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; set_bit(USB_SERIAL_THROTTLED, &port->flags); } EXPORT_SYMBOL_GPL(usb_serial_generic_throttle); void usb_serial_generic_unthrottle(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; clear_bit(USB_SERIAL_THROTTLED, &port->flags); /* * Matches the smp_mb__after_atomic() in * usb_serial_generic_read_bulk_callback(). */ smp_mb__after_atomic(); usb_serial_generic_submit_read_urbs(port, GFP_KERNEL); } EXPORT_SYMBOL_GPL(usb_serial_generic_unthrottle); static bool usb_serial_generic_msr_changed(struct tty_struct *tty, unsigned long arg, struct async_icount *cprev) { struct usb_serial_port *port = tty->driver_data; struct async_icount cnow; unsigned long flags; bool ret; /* * Use tty-port initialised flag to detect all hangups including the * one generated at USB-device disconnect. */ if (!tty_port_initialized(&port->port)) return true; spin_lock_irqsave(&port->lock, flags); cnow = port->icount; /* atomic copy*/ spin_unlock_irqrestore(&port->lock, flags); ret = ((arg & TIOCM_RNG) && (cnow.rng != cprev->rng)) || ((arg & TIOCM_DSR) && (cnow.dsr != cprev->dsr)) || ((arg & TIOCM_CD) && (cnow.dcd != cprev->dcd)) || ((arg & TIOCM_CTS) && (cnow.cts != cprev->cts)); *cprev = cnow; return ret; } int usb_serial_generic_tiocmiwait(struct tty_struct *tty, unsigned long arg) { struct usb_serial_port *port = tty->driver_data; struct async_icount cnow; unsigned long flags; int ret; spin_lock_irqsave(&port->lock, flags); cnow = port->icount; /* atomic copy */ spin_unlock_irqrestore(&port->lock, flags); ret = wait_event_interruptible(port->port.delta_msr_wait, usb_serial_generic_msr_changed(tty, arg, &cnow)); if (!ret && !tty_port_initialized(&port->port)) ret = -EIO; return ret; } EXPORT_SYMBOL_GPL(usb_serial_generic_tiocmiwait); int usb_serial_generic_get_icount(struct tty_struct *tty, struct serial_icounter_struct *icount) { struct usb_serial_port *port = tty->driver_data; struct async_icount cnow; unsigned long flags; spin_lock_irqsave(&port->lock, flags); cnow = port->icount; /* atomic copy */ spin_unlock_irqrestore(&port->lock, flags); icount->cts = cnow.cts; icount->dsr = cnow.dsr; icount->rng = cnow.rng; icount->dcd = cnow.dcd; icount->tx = cnow.tx; icount->rx = cnow.rx; icount->frame = cnow.frame; icount->parity = cnow.parity; icount->overrun = cnow.overrun; icount->brk = cnow.brk; icount->buf_overrun = cnow.buf_overrun; return 0; } EXPORT_SYMBOL_GPL(usb_serial_generic_get_icount); #if defined(CONFIG_USB_SERIAL_CONSOLE) && defined(CONFIG_MAGIC_SYSRQ) int usb_serial_handle_sysrq_char(struct usb_serial_port *port, unsigned int ch) { if (port->sysrq) { if (ch && time_before(jiffies, port->sysrq)) { handle_sysrq(ch); port->sysrq = 0; return 1; } port->sysrq = 0; } return 0; } EXPORT_SYMBOL_GPL(usb_serial_handle_sysrq_char); int usb_serial_handle_break(struct usb_serial_port *port) { if (!port->port.console) return 0; if (!port->sysrq) { port->sysrq = jiffies + HZ*5; return 1; } port->sysrq = 0; return 0; } EXPORT_SYMBOL_GPL(usb_serial_handle_break); #endif /** * usb_serial_handle_dcd_change - handle a change of carrier detect state * @port: usb-serial port * @tty: tty for the port * @status: new carrier detect status, nonzero if active */ void usb_serial_handle_dcd_change(struct usb_serial_port *port, struct tty_struct *tty, unsigned int status) { dev_dbg(&port->dev, "%s - status %d\n", __func__, status); if (tty) { struct tty_ldisc *ld = tty_ldisc_ref(tty); if (ld) { if (ld->ops->dcd_change) ld->ops->dcd_change(tty, status); tty_ldisc_deref(ld); } } if (status) wake_up_interruptible(&port->port.open_wait); else if (tty && !C_CLOCAL(tty)) tty_hangup(tty); } EXPORT_SYMBOL_GPL(usb_serial_handle_dcd_change); int usb_serial_generic_resume(struct usb_serial *serial) { struct usb_serial_port *port; int i, c = 0, r; for (i = 0; i < serial->num_ports; i++) { port = serial->port[i]; if (!tty_port_initialized(&port->port)) continue; if (port->bulk_in_size) { r = usb_serial_generic_submit_read_urbs(port, GFP_NOIO); if (r < 0) c++; } if (port->bulk_out_size) { r = usb_serial_generic_write_start(port, GFP_NOIO); if (r < 0) c++; } } return c ? -EIO : 0; } EXPORT_SYMBOL_GPL(usb_serial_generic_resume); |
1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 | // SPDX-License-Identifier: GPL-2.0-only /*************************************************************************** * Copyright (C) 2010-2012 by Bruno Prémont <bonbons@linux-vserver.org> * * * * Based on Logitech G13 driver (v0.4) * * Copyright (C) 2009 by Rick L. Vinyard, Jr. <rvinyard@cs.nmsu.edu> * * * ***************************************************************************/ #include <linux/hid.h> #include <linux/hid-debug.h> #include <linux/input.h> #include "hid-ids.h" #include <linux/fb.h> #include <linux/vmalloc.h> #include <linux/backlight.h> #include <linux/lcd.h> #include <linux/leds.h> #include <linux/seq_file.h> #include <linux/debugfs.h> #include <linux/completion.h> #include <linux/uaccess.h> #include <linux/module.h> #include "hid-picolcd.h" void picolcd_leds_set(struct picolcd_data *data) { struct hid_report *report; unsigned long flags; if (!data->led[0]) return; report = picolcd_out_report(REPORT_LED_STATE, data->hdev); if (!report || report->maxfield != 1 || report->field[0]->report_count != 1) return; spin_lock_irqsave(&data->lock, flags); hid_set_field(report->field[0], 0, data->led_state); if (!(data->status & PICOLCD_FAILED)) hid_hw_request(data->hdev, report, HID_REQ_SET_REPORT); spin_unlock_irqrestore(&data->lock, flags); } static void picolcd_led_set_brightness(struct led_classdev *led_cdev, enum led_brightness value) { struct device *dev; struct hid_device *hdev; struct picolcd_data *data; int i, state = 0; dev = led_cdev->dev->parent; hdev = to_hid_device(dev); data = hid_get_drvdata(hdev); if (!data) return; for (i = 0; i < 8; i++) { if (led_cdev != data->led[i]) continue; state = (data->led_state >> i) & 1; if (value == LED_OFF && state) { data->led_state &= ~(1 << i); picolcd_leds_set(data); } else if (value != LED_OFF && !state) { data->led_state |= 1 << i; picolcd_leds_set(data); } break; } } static enum led_brightness picolcd_led_get_brightness(struct led_classdev *led_cdev) { struct device *dev; struct hid_device *hdev; struct picolcd_data *data; int i, value = 0; dev = led_cdev->dev->parent; hdev = to_hid_device(dev); data = hid_get_drvdata(hdev); for (i = 0; i < 8; i++) if (led_cdev == data->led[i]) { value = (data->led_state >> i) & 1; break; } return value ? LED_FULL : LED_OFF; } int picolcd_init_leds(struct picolcd_data *data, struct hid_report *report) { struct device *dev = &data->hdev->dev; struct led_classdev *led; size_t name_sz = strlen(dev_name(dev)) + 8; char *name; int i, ret = 0; if (!report) return -ENODEV; if (report->maxfield != 1 || report->field[0]->report_count != 1 || report->field[0]->report_size != 8) { dev_err(dev, "unsupported LED_STATE report"); return -EINVAL; } for (i = 0; i < 8; i++) { led = kzalloc(sizeof(struct led_classdev)+name_sz, GFP_KERNEL); if (!led) { dev_err(dev, "can't allocate memory for LED %d\n", i); ret = -ENOMEM; goto err; } name = (void *)(&led[1]); snprintf(name, name_sz, "%s::GPO%d", dev_name(dev), i); led->name = name; led->brightness = 0; led->max_brightness = 1; led->brightness_get = picolcd_led_get_brightness; led->brightness_set = picolcd_led_set_brightness; data->led[i] = led; ret = led_classdev_register(dev, data->led[i]); if (ret) { data->led[i] = NULL; kfree(led); dev_err(dev, "can't register LED %d\n", i); goto err; } } return 0; err: for (i = 0; i < 8; i++) if (data->led[i]) { led = data->led[i]; data->led[i] = NULL; led_classdev_unregister(led); kfree(led); } return ret; } void picolcd_exit_leds(struct picolcd_data *data) { struct led_classdev *led; int i; for (i = 0; i < 8; i++) { led = data->led[i]; data->led[i] = NULL; if (!led) continue; led_classdev_unregister(led); kfree(led); } } |
132 57 76 98 34 131 92 92 92 92 132 132 11 5 92 5 92 92 92 148 3 120 27 134 12 92 54 1 1 9 1 28 1 110 4 5 92 111 1 110 16 2 14 40 1 39 6 84 14 425 425 425 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 | // SPDX-License-Identifier: GPL-2.0-only /* * xt_hashlimit - Netfilter module to limit the number of packets per time * separately for each hashbucket (sourceip/sourceport/dstip/dstport) * * (C) 2003-2004 by Harald Welte <laforge@netfilter.org> * (C) 2006-2012 Patrick McHardy <kaber@trash.net> * Copyright © CC Computer Consultants GmbH, 2007 - 2008 * * Development of this code was funded by Astaro AG, http://www.astaro.com/ */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/spinlock.h> #include <linux/random.h> #include <linux/jhash.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/list.h> #include <linux/skbuff.h> #include <linux/mm.h> #include <linux/in.h> #include <linux/ip.h> #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) #include <linux/ipv6.h> #include <net/ipv6.h> #endif #include <net/net_namespace.h> #include <net/netns/generic.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv6/ip6_tables.h> #include <linux/mutex.h> #include <linux/kernel.h> #include <linux/refcount.h> #include <uapi/linux/netfilter/xt_hashlimit.h> #define XT_HASHLIMIT_ALL (XT_HASHLIMIT_HASH_DIP | XT_HASHLIMIT_HASH_DPT | \ XT_HASHLIMIT_HASH_SIP | XT_HASHLIMIT_HASH_SPT | \ XT_HASHLIMIT_INVERT | XT_HASHLIMIT_BYTES |\ XT_HASHLIMIT_RATE_MATCH) MODULE_LICENSE("GPL"); MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>"); MODULE_DESCRIPTION("Xtables: per hash-bucket rate-limit match"); MODULE_ALIAS("ipt_hashlimit"); MODULE_ALIAS("ip6t_hashlimit"); struct hashlimit_net { struct hlist_head htables; struct proc_dir_entry *ipt_hashlimit; struct proc_dir_entry *ip6t_hashlimit; }; static unsigned int hashlimit_net_id; static inline struct hashlimit_net *hashlimit_pernet(struct net *net) { return net_generic(net, hashlimit_net_id); } /* need to declare this at the top */ static const struct seq_operations dl_seq_ops_v2; static const struct seq_operations dl_seq_ops_v1; static const struct seq_operations dl_seq_ops; /* hash table crap */ struct dsthash_dst { union { struct { __be32 src; __be32 dst; } ip; #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) struct { __be32 src[4]; __be32 dst[4]; } ip6; #endif }; __be16 src_port; __be16 dst_port; }; struct dsthash_ent { /* static / read-only parts in the beginning */ struct hlist_node node; struct dsthash_dst dst; /* modified structure members in the end */ spinlock_t lock; unsigned long expires; /* precalculated expiry time */ struct { unsigned long prev; /* last modification */ union { struct { u_int64_t credit; u_int64_t credit_cap; u_int64_t cost; }; struct { u_int32_t interval, prev_window; u_int64_t current_rate; u_int64_t rate; int64_t burst; }; }; } rateinfo; struct rcu_head rcu; }; struct xt_hashlimit_htable { struct hlist_node node; /* global list of all htables */ refcount_t use; u_int8_t family; bool rnd_initialized; struct hashlimit_cfg3 cfg; /* config */ /* used internally */ spinlock_t lock; /* lock for list_head */ u_int32_t rnd; /* random seed for hash */ unsigned int count; /* number entries in table */ struct delayed_work gc_work; /* seq_file stuff */ struct proc_dir_entry *pde; const char *name; struct net *net; struct hlist_head hash[]; /* hashtable itself */ }; static int cfg_copy(struct hashlimit_cfg3 *to, const void *from, int revision) { if (revision == 1) { struct hashlimit_cfg1 *cfg = (struct hashlimit_cfg1 *)from; to->mode = cfg->mode; to->avg = cfg->avg; to->burst = cfg->burst; to->size = cfg->size; to->max = cfg->max; to->gc_interval = cfg->gc_interval; to->expire = cfg->expire; to->srcmask = cfg->srcmask; to->dstmask = cfg->dstmask; } else if (revision == 2) { struct hashlimit_cfg2 *cfg = (struct hashlimit_cfg2 *)from; to->mode = cfg->mode; to->avg = cfg->avg; to->burst = cfg->burst; to->size = cfg->size; to->max = cfg->max; to->gc_interval = cfg->gc_interval; to->expire = cfg->expire; to->srcmask = cfg->srcmask; to->dstmask = cfg->dstmask; } else if (revision == 3) { memcpy(to, from, sizeof(struct hashlimit_cfg3)); } else { return -EINVAL; } return 0; } static DEFINE_MUTEX(hashlimit_mutex); /* protects htables list */ static struct kmem_cache *hashlimit_cachep __read_mostly; static inline bool dst_cmp(const struct dsthash_ent *ent, const struct dsthash_dst *b) { return !memcmp(&ent->dst, b, sizeof(ent->dst)); } static u_int32_t hash_dst(const struct xt_hashlimit_htable *ht, const struct dsthash_dst *dst) { u_int32_t hash = jhash2((const u32 *)dst, sizeof(*dst)/sizeof(u32), ht->rnd); /* * Instead of returning hash % ht->cfg.size (implying a divide) * we return the high 32 bits of the (hash * ht->cfg.size) that will * give results between [0 and cfg.size-1] and same hash distribution, * but using a multiply, less expensive than a divide */ return reciprocal_scale(hash, ht->cfg.size); } static struct dsthash_ent * dsthash_find(const struct xt_hashlimit_htable *ht, const struct dsthash_dst *dst) { struct dsthash_ent *ent; u_int32_t hash = hash_dst(ht, dst); if (!hlist_empty(&ht->hash[hash])) { hlist_for_each_entry_rcu(ent, &ht->hash[hash], node) if (dst_cmp(ent, dst)) { spin_lock(&ent->lock); return ent; } } return NULL; } /* allocate dsthash_ent, initialize dst, put in htable and lock it */ static struct dsthash_ent * dsthash_alloc_init(struct xt_hashlimit_htable *ht, const struct dsthash_dst *dst, bool *race) { struct dsthash_ent *ent; spin_lock(&ht->lock); /* Two or more packets may race to create the same entry in the * hashtable, double check if this packet lost race. */ ent = dsthash_find(ht, dst); if (ent != NULL) { spin_unlock(&ht->lock); *race = true; return ent; } /* initialize hash with random val at the time we allocate * the first hashtable entry */ if (unlikely(!ht->rnd_initialized)) { get_random_bytes(&ht->rnd, sizeof(ht->rnd)); ht->rnd_initialized = true; } if (ht->cfg.max && ht->count >= ht->cfg.max) { /* FIXME: do something. question is what.. */ net_err_ratelimited("max count of %u reached\n", ht->cfg.max); ent = NULL; } else ent = kmem_cache_alloc(hashlimit_cachep, GFP_ATOMIC); if (ent) { memcpy(&ent->dst, dst, sizeof(ent->dst)); spin_lock_init(&ent->lock); spin_lock(&ent->lock); hlist_add_head_rcu(&ent->node, &ht->hash[hash_dst(ht, dst)]); ht->count++; } spin_unlock(&ht->lock); return ent; } static void dsthash_free_rcu(struct rcu_head *head) { struct dsthash_ent *ent = container_of(head, struct dsthash_ent, rcu); kmem_cache_free(hashlimit_cachep, ent); } static inline void dsthash_free(struct xt_hashlimit_htable *ht, struct dsthash_ent *ent) { hlist_del_rcu(&ent->node); call_rcu(&ent->rcu, dsthash_free_rcu); ht->count--; } static void htable_gc(struct work_struct *work); static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg, const char *name, u_int8_t family, struct xt_hashlimit_htable **out_hinfo, int revision) { struct hashlimit_net *hashlimit_net = hashlimit_pernet(net); struct xt_hashlimit_htable *hinfo; const struct seq_operations *ops; unsigned int size, i; unsigned long nr_pages = totalram_pages(); int ret; if (cfg->size) { size = cfg->size; } else { size = (nr_pages << PAGE_SHIFT) / 16384 / sizeof(struct hlist_head); if (nr_pages > 1024 * 1024 * 1024 / PAGE_SIZE) size = 8192; if (size < 16) size = 16; } /* FIXME: don't use vmalloc() here or anywhere else -HW */ hinfo = vmalloc(struct_size(hinfo, hash, size)); if (hinfo == NULL) return -ENOMEM; *out_hinfo = hinfo; /* copy match config into hashtable config */ ret = cfg_copy(&hinfo->cfg, (void *)cfg, 3); if (ret) { vfree(hinfo); return ret; } hinfo->cfg.size = size; if (hinfo->cfg.max == 0) hinfo->cfg.max = 8 * hinfo->cfg.size; else if (hinfo->cfg.max < hinfo->cfg.size) hinfo->cfg.max = hinfo->cfg.size; for (i = 0; i < hinfo->cfg.size; i++) INIT_HLIST_HEAD(&hinfo->hash[i]); refcount_set(&hinfo->use, 1); hinfo->count = 0; hinfo->family = family; hinfo->rnd_initialized = false; hinfo->name = kstrdup(name, GFP_KERNEL); if (!hinfo->name) { vfree(hinfo); return -ENOMEM; } spin_lock_init(&hinfo->lock); switch (revision) { case 1: ops = &dl_seq_ops_v1; break; case 2: ops = &dl_seq_ops_v2; break; default: ops = &dl_seq_ops; } hinfo->pde = proc_create_seq_data(name, 0, (family == NFPROTO_IPV4) ? hashlimit_net->ipt_hashlimit : hashlimit_net->ip6t_hashlimit, ops, hinfo); if (hinfo->pde == NULL) { kfree(hinfo->name); vfree(hinfo); return -ENOMEM; } hinfo->net = net; INIT_DEFERRABLE_WORK(&hinfo->gc_work, htable_gc); queue_delayed_work(system_power_efficient_wq, &hinfo->gc_work, msecs_to_jiffies(hinfo->cfg.gc_interval)); hlist_add_head(&hinfo->node, &hashlimit_net->htables); return 0; } static void htable_selective_cleanup(struct xt_hashlimit_htable *ht, bool select_all) { unsigned int i; for (i = 0; i < ht->cfg.size; i++) { struct dsthash_ent *dh; struct hlist_node *n; spin_lock_bh(&ht->lock); hlist_for_each_entry_safe(dh, n, &ht->hash[i], node) { if (time_after_eq(jiffies, dh->expires) || select_all) dsthash_free(ht, dh); } spin_unlock_bh(&ht->lock); cond_resched(); } } static void htable_gc(struct work_struct *work) { struct xt_hashlimit_htable *ht; ht = container_of(work, struct xt_hashlimit_htable, gc_work.work); htable_selective_cleanup(ht, false); queue_delayed_work(system_power_efficient_wq, &ht->gc_work, msecs_to_jiffies(ht->cfg.gc_interval)); } static void htable_remove_proc_entry(struct xt_hashlimit_htable *hinfo) { struct hashlimit_net *hashlimit_net = hashlimit_pernet(hinfo->net); struct proc_dir_entry *parent; if (hinfo->family == NFPROTO_IPV4) parent = hashlimit_net->ipt_hashlimit; else parent = hashlimit_net->ip6t_hashlimit; if (parent != NULL) remove_proc_entry(hinfo->name, parent); } static struct xt_hashlimit_htable *htable_find_get(struct net *net, const char *name, u_int8_t family) { struct hashlimit_net *hashlimit_net = hashlimit_pernet(net); struct xt_hashlimit_htable *hinfo; hlist_for_each_entry(hinfo, &hashlimit_net->htables, node) { if (!strcmp(name, hinfo->name) && hinfo->family == family) { refcount_inc(&hinfo->use); return hinfo; } } return NULL; } static void htable_put(struct xt_hashlimit_htable *hinfo) { if (refcount_dec_and_mutex_lock(&hinfo->use, &hashlimit_mutex)) { hlist_del(&hinfo->node); htable_remove_proc_entry(hinfo); mutex_unlock(&hashlimit_mutex); cancel_delayed_work_sync(&hinfo->gc_work); htable_selective_cleanup(hinfo, true); kfree(hinfo->name); vfree(hinfo); } } /* The algorithm used is the Simple Token Bucket Filter (TBF) * see net/sched/sch_tbf.c in the linux source tree */ /* Rusty: This is my (non-mathematically-inclined) understanding of this algorithm. The `average rate' in jiffies becomes your initial amount of credit `credit' and the most credit you can ever have `credit_cap'. The `peak rate' becomes the cost of passing the test, `cost'. `prev' tracks the last packet hit: you gain one credit per jiffy. If you get credit balance more than this, the extra credit is discarded. Every time the match passes, you lose `cost' credits; if you don't have that many, the test fails. See Alexey's formal explanation in net/sched/sch_tbf.c. To get the maximum range, we multiply by this factor (ie. you get N credits per jiffy). We want to allow a rate as low as 1 per day (slowest userspace tool allows), which means CREDITS_PER_JIFFY*HZ*60*60*24 < 2^32 ie. */ #define MAX_CPJ_v1 (0xFFFFFFFF / (HZ*60*60*24)) #define MAX_CPJ (0xFFFFFFFFFFFFFFFFULL / (HZ*60*60*24)) /* Repeated shift and or gives us all 1s, final shift and add 1 gives * us the power of 2 below the theoretical max, so GCC simply does a * shift. */ #define _POW2_BELOW2(x) ((x)|((x)>>1)) #define _POW2_BELOW4(x) (_POW2_BELOW2(x)|_POW2_BELOW2((x)>>2)) #define _POW2_BELOW8(x) (_POW2_BELOW4(x)|_POW2_BELOW4((x)>>4)) #define _POW2_BELOW16(x) (_POW2_BELOW8(x)|_POW2_BELOW8((x)>>8)) #define _POW2_BELOW32(x) (_POW2_BELOW16(x)|_POW2_BELOW16((x)>>16)) #define _POW2_BELOW64(x) (_POW2_BELOW32(x)|_POW2_BELOW32((x)>>32)) #define POW2_BELOW32(x) ((_POW2_BELOW32(x)>>1) + 1) #define POW2_BELOW64(x) ((_POW2_BELOW64(x)>>1) + 1) #define CREDITS_PER_JIFFY POW2_BELOW64(MAX_CPJ) #define CREDITS_PER_JIFFY_v1 POW2_BELOW32(MAX_CPJ_v1) /* in byte mode, the lowest possible rate is one packet/second. * credit_cap is used as a counter that tells us how many times we can * refill the "credits available" counter when it becomes empty. */ #define MAX_CPJ_BYTES (0xFFFFFFFF / HZ) #define CREDITS_PER_JIFFY_BYTES POW2_BELOW32(MAX_CPJ_BYTES) static u32 xt_hashlimit_len_to_chunks(u32 len) { return (len >> XT_HASHLIMIT_BYTE_SHIFT) + 1; } /* Precision saver. */ static u64 user2credits(u64 user, int revision) { u64 scale = (revision == 1) ? XT_HASHLIMIT_SCALE : XT_HASHLIMIT_SCALE_v2; u64 cpj = (revision == 1) ? CREDITS_PER_JIFFY_v1 : CREDITS_PER_JIFFY; /* Avoid overflow: divide the constant operands first */ if (scale >= HZ * cpj) return div64_u64(user, div64_u64(scale, HZ * cpj)); return user * div64_u64(HZ * cpj, scale); } static u32 user2credits_byte(u32 user) { u64 us = user; us *= HZ * CREDITS_PER_JIFFY_BYTES; return (u32) (us >> 32); } static u64 user2rate(u64 user) { if (user != 0) { return div64_u64(XT_HASHLIMIT_SCALE_v2, user); } else { pr_info_ratelimited("invalid rate from userspace: %llu\n", user); return 0; } } static u64 user2rate_bytes(u32 user) { u64 r; r = user ? U32_MAX / user : U32_MAX; return (r - 1) << XT_HASHLIMIT_BYTE_SHIFT; } static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now, u32 mode, int revision) { unsigned long delta = now - dh->rateinfo.prev; u64 cap, cpj; if (delta == 0) return; if (revision >= 3 && mode & XT_HASHLIMIT_RATE_MATCH) { u64 interval = dh->rateinfo.interval * HZ; if (delta < interval) return; dh->rateinfo.prev = now; dh->rateinfo.prev_window = ((dh->rateinfo.current_rate * interval) > (delta * dh->rateinfo.rate)); dh->rateinfo.current_rate = 0; return; } dh->rateinfo.prev = now; if (mode & XT_HASHLIMIT_BYTES) { u64 tmp = dh->rateinfo.credit; dh->rateinfo.credit += CREDITS_PER_JIFFY_BYTES * delta; cap = CREDITS_PER_JIFFY_BYTES * HZ; if (tmp >= dh->rateinfo.credit) {/* overflow */ dh->rateinfo.credit = cap; return; } } else { cpj = (revision == 1) ? CREDITS_PER_JIFFY_v1 : CREDITS_PER_JIFFY; dh->rateinfo.credit += delta * cpj; cap = dh->rateinfo.credit_cap; } if (dh->rateinfo.credit > cap) dh->rateinfo.credit = cap; } static void rateinfo_init(struct dsthash_ent *dh, struct xt_hashlimit_htable *hinfo, int revision) { dh->rateinfo.prev = jiffies; if (revision >= 3 && hinfo->cfg.mode & XT_HASHLIMIT_RATE_MATCH) { dh->rateinfo.prev_window = 0; dh->rateinfo.current_rate = 0; if (hinfo->cfg.mode & XT_HASHLIMIT_BYTES) { dh->rateinfo.rate = user2rate_bytes((u32)hinfo->cfg.avg); if (hinfo->cfg.burst) dh->rateinfo.burst = hinfo->cfg.burst * dh->rateinfo.rate; else dh->rateinfo.burst = dh->rateinfo.rate; } else { dh->rateinfo.rate = user2rate(hinfo->cfg.avg); dh->rateinfo.burst = hinfo->cfg.burst + dh->rateinfo.rate; } dh->rateinfo.interval = hinfo->cfg.interval; } else if (hinfo->cfg.mode & XT_HASHLIMIT_BYTES) { dh->rateinfo.credit = CREDITS_PER_JIFFY_BYTES * HZ; dh->rateinfo.cost = user2credits_byte(hinfo->cfg.avg); dh->rateinfo.credit_cap = hinfo->cfg.burst; } else { dh->rateinfo.credit = user2credits(hinfo->cfg.avg * hinfo->cfg.burst, revision); dh->rateinfo.cost = user2credits(hinfo->cfg.avg, revision); dh->rateinfo.credit_cap = dh->rateinfo.credit; } } static inline __be32 maskl(__be32 a, unsigned int l) { return l ? htonl(ntohl(a) & ~0 << (32 - l)) : 0; } #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) static void hashlimit_ipv6_mask(__be32 *i, unsigned int p) { switch (p) { case 0 ... 31: i[0] = maskl(i[0], p); i[1] = i[2] = i[3] = 0; break; case 32 ... 63: i[1] = maskl(i[1], p - 32); i[2] = i[3] = 0; break; case 64 ... 95: i[2] = maskl(i[2], p - 64); i[3] = 0; break; case 96 ... 127: i[3] = maskl(i[3], p - 96); break; case 128: break; } } #endif static int hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo, struct dsthash_dst *dst, const struct sk_buff *skb, unsigned int protoff) { __be16 _ports[2], *ports; u8 nexthdr; int poff; memset(dst, 0, sizeof(*dst)); switch (hinfo->family) { case NFPROTO_IPV4: if (hinfo->cfg.mode & XT_HASHLIMIT_HASH_DIP) dst->ip.dst = maskl(ip_hdr(skb)->daddr, hinfo->cfg.dstmask); if (hinfo->cfg.mode & XT_HASHLIMIT_HASH_SIP) dst->ip.src = maskl(ip_hdr(skb)->saddr, hinfo->cfg.srcmask); if (!(hinfo->cfg.mode & (XT_HASHLIMIT_HASH_DPT | XT_HASHLIMIT_HASH_SPT))) return 0; nexthdr = ip_hdr(skb)->protocol; break; #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) case NFPROTO_IPV6: { __be16 frag_off; if (hinfo->cfg.mode & XT_HASHLIMIT_HASH_DIP) { memcpy(&dst->ip6.dst, &ipv6_hdr(skb)->daddr, sizeof(dst->ip6.dst)); hashlimit_ipv6_mask(dst->ip6.dst, hinfo->cfg.dstmask); } if (hinfo->cfg.mode & XT_HASHLIMIT_HASH_SIP) { memcpy(&dst->ip6.src, &ipv6_hdr(skb)->saddr, sizeof(dst->ip6.src)); hashlimit_ipv6_mask(dst->ip6.src, hinfo->cfg.srcmask); } if (!(hinfo->cfg.mode & (XT_HASHLIMIT_HASH_DPT | XT_HASHLIMIT_HASH_SPT))) return 0; nexthdr = ipv6_hdr(skb)->nexthdr; protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, &frag_off); if ((int)protoff < 0) return -1; break; } #endif default: BUG(); return 0; } poff = proto_ports_offset(nexthdr); if (poff >= 0) { ports = skb_header_pointer(skb, protoff + poff, sizeof(_ports), &_ports); } else { _ports[0] = _ports[1] = 0; ports = _ports; } if (!ports) return -1; if (hinfo->cfg.mode & XT_HASHLIMIT_HASH_SPT) dst->src_port = ports[0]; if (hinfo->cfg.mode & XT_HASHLIMIT_HASH_DPT) dst->dst_port = ports[1]; return 0; } static u32 hashlimit_byte_cost(unsigned int len, struct dsthash_ent *dh) { u64 tmp = xt_hashlimit_len_to_chunks(len); tmp = tmp * dh->rateinfo.cost; if (unlikely(tmp > CREDITS_PER_JIFFY_BYTES * HZ)) tmp = CREDITS_PER_JIFFY_BYTES * HZ; if (dh->rateinfo.credit < tmp && dh->rateinfo.credit_cap) { dh->rateinfo.credit_cap--; dh->rateinfo.credit = CREDITS_PER_JIFFY_BYTES * HZ; } return (u32) tmp; } static bool hashlimit_mt_common(const struct sk_buff *skb, struct xt_action_param *par, struct xt_hashlimit_htable *hinfo, const struct hashlimit_cfg3 *cfg, int revision) { unsigned long now = jiffies; struct dsthash_ent *dh; struct dsthash_dst dst; bool race = false; u64 cost; if (hashlimit_init_dst(hinfo, &dst, skb, par->thoff) < 0) goto hotdrop; local_bh_disable(); dh = dsthash_find(hinfo, &dst); if (dh == NULL) { dh = dsthash_alloc_init(hinfo, &dst, &race); if (dh == NULL) { local_bh_enable(); goto hotdrop; } else if (race) { /* Already got an entry, update expiration timeout */ dh->expires = now + msecs_to_jiffies(hinfo->cfg.expire); rateinfo_recalc(dh, now, hinfo->cfg.mode, revision); } else { dh->expires = jiffies + msecs_to_jiffies(hinfo->cfg.expire); rateinfo_init(dh, hinfo, revision); } } else { /* update expiration timeout */ dh->expires = now + msecs_to_jiffies(hinfo->cfg.expire); rateinfo_recalc(dh, now, hinfo->cfg.mode, revision); } if (cfg->mode & XT_HASHLIMIT_RATE_MATCH) { cost = (cfg->mode & XT_HASHLIMIT_BYTES) ? skb->len : 1; dh->rateinfo.current_rate += cost; if (!dh->rateinfo.prev_window && (dh->rateinfo.current_rate <= dh->rateinfo.burst)) { spin_unlock(&dh->lock); local_bh_enable(); return !(cfg->mode & XT_HASHLIMIT_INVERT); } else { goto overlimit; } } if (cfg->mode & XT_HASHLIMIT_BYTES) cost = hashlimit_byte_cost(skb->len, dh); else cost = dh->rateinfo.cost; if (dh->rateinfo.credit >= cost) { /* below the limit */ dh->rateinfo.credit -= cost; spin_unlock(&dh->lock); local_bh_enable(); return !(cfg->mode & XT_HASHLIMIT_INVERT); } overlimit: spin_unlock(&dh->lock); local_bh_enable(); /* default match is underlimit - so over the limit, we need to invert */ return cfg->mode & XT_HASHLIMIT_INVERT; hotdrop: par->hotdrop = true; return false; } static bool hashlimit_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_hashlimit_mtinfo1 *info = par->matchinfo; struct xt_hashlimit_htable *hinfo = info->hinfo; struct hashlimit_cfg3 cfg = {}; int ret; ret = cfg_copy(&cfg, (void *)&info->cfg, 1); if (ret) return ret; return hashlimit_mt_common(skb, par, hinfo, &cfg, 1); } static bool hashlimit_mt_v2(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_hashlimit_mtinfo2 *info = par->matchinfo; struct xt_hashlimit_htable *hinfo = info->hinfo; struct hashlimit_cfg3 cfg = {}; int ret; ret = cfg_copy(&cfg, (void *)&info->cfg, 2); if (ret) return ret; return hashlimit_mt_common(skb, par, hinfo, &cfg, 2); } static bool hashlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_hashlimit_mtinfo3 *info = par->matchinfo; struct xt_hashlimit_htable *hinfo = info->hinfo; return hashlimit_mt_common(skb, par, hinfo, &info->cfg, 3); } #define HASHLIMIT_MAX_SIZE 1048576 static int hashlimit_mt_check_common(const struct xt_mtchk_param *par, struct xt_hashlimit_htable **hinfo, struct hashlimit_cfg3 *cfg, const char *name, int revision) { struct net *net = par->net; int ret; if (cfg->gc_interval == 0 || cfg->expire == 0) return -EINVAL; if (cfg->size > HASHLIMIT_MAX_SIZE) { cfg->size = HASHLIMIT_MAX_SIZE; pr_info_ratelimited("size too large, truncated to %u\n", cfg->size); } if (cfg->max > HASHLIMIT_MAX_SIZE) { cfg->max = HASHLIMIT_MAX_SIZE; pr_info_ratelimited("max too large, truncated to %u\n", cfg->max); } if (par->family == NFPROTO_IPV4) { if (cfg->srcmask > 32 || cfg->dstmask > 32) return -EINVAL; } else { if (cfg->srcmask > 128 || cfg->dstmask > 128) return -EINVAL; } if (cfg->mode & ~XT_HASHLIMIT_ALL) { pr_info_ratelimited("Unknown mode mask %X, kernel too old?\n", cfg->mode); return -EINVAL; } /* Check for overflow. */ if (revision >= 3 && cfg->mode & XT_HASHLIMIT_RATE_MATCH) { if (cfg->avg == 0 || cfg->avg > U32_MAX) { pr_info_ratelimited("invalid rate\n"); return -ERANGE; } if (cfg->interval == 0) { pr_info_ratelimited("invalid interval\n"); return -EINVAL; } } else if (cfg->mode & XT_HASHLIMIT_BYTES) { if (user2credits_byte(cfg->avg) == 0) { pr_info_ratelimited("overflow, rate too high: %llu\n", cfg->avg); return -EINVAL; } } else if (cfg->burst == 0 || user2credits(cfg->avg * cfg->burst, revision) < user2credits(cfg->avg, revision)) { pr_info_ratelimited("overflow, try lower: %llu/%llu\n", cfg->avg, cfg->burst); return -ERANGE; } mutex_lock(&hashlimit_mutex); *hinfo = htable_find_get(net, name, par->family); if (*hinfo == NULL) { ret = htable_create(net, cfg, name, par->family, hinfo, revision); if (ret < 0) { mutex_unlock(&hashlimit_mutex); return ret; } } mutex_unlock(&hashlimit_mutex); return 0; } static int hashlimit_mt_check_v1(const struct xt_mtchk_param *par) { struct xt_hashlimit_mtinfo1 *info = par->matchinfo; struct hashlimit_cfg3 cfg = {}; int ret; ret = xt_check_proc_name(info->name, sizeof(info->name)); if (ret) return ret; ret = cfg_copy(&cfg, (void *)&info->cfg, 1); if (ret) return ret; return hashlimit_mt_check_common(par, &info->hinfo, &cfg, info->name, 1); } static int hashlimit_mt_check_v2(const struct xt_mtchk_param *par) { struct xt_hashlimit_mtinfo2 *info = par->matchinfo; struct hashlimit_cfg3 cfg = {}; int ret; ret = xt_check_proc_name(info->name, sizeof(info->name)); if (ret) return ret; ret = cfg_copy(&cfg, (void *)&info->cfg, 2); if (ret) return ret; return hashlimit_mt_check_common(par, &info->hinfo, &cfg, info->name, 2); } static int hashlimit_mt_check(const struct xt_mtchk_param *par) { struct xt_hashlimit_mtinfo3 *info = par->matchinfo; int ret; ret = xt_check_proc_name(info->name, sizeof(info->name)); if (ret) return ret; return hashlimit_mt_check_common(par, &info->hinfo, &info->cfg, info->name, 3); } static void hashlimit_mt_destroy_v2(const struct xt_mtdtor_param *par) { const struct xt_hashlimit_mtinfo2 *info = par->matchinfo; htable_put(info->hinfo); } static void hashlimit_mt_destroy_v1(const struct xt_mtdtor_param *par) { const struct xt_hashlimit_mtinfo1 *info = par->matchinfo; htable_put(info->hinfo); } static void hashlimit_mt_destroy(const struct xt_mtdtor_param *par) { const struct xt_hashlimit_mtinfo3 *info = par->matchinfo; htable_put(info->hinfo); } static struct xt_match hashlimit_mt_reg[] __read_mostly = { { .name = "hashlimit", .revision = 1, .family = NFPROTO_IPV4, .match = hashlimit_mt_v1, .matchsize = sizeof(struct xt_hashlimit_mtinfo1), .usersize = offsetof(struct xt_hashlimit_mtinfo1, hinfo), .checkentry = hashlimit_mt_check_v1, .destroy = hashlimit_mt_destroy_v1, .me = THIS_MODULE, }, { .name = "hashlimit", .revision = 2, .family = NFPROTO_IPV4, .match = hashlimit_mt_v2, .matchsize = sizeof(struct xt_hashlimit_mtinfo2), .usersize = offsetof(struct xt_hashlimit_mtinfo2, hinfo), .checkentry = hashlimit_mt_check_v2, .destroy = hashlimit_mt_destroy_v2, .me = THIS_MODULE, }, { .name = "hashlimit", .revision = 3, .family = NFPROTO_IPV4, .match = hashlimit_mt, .matchsize = sizeof(struct xt_hashlimit_mtinfo3), .usersize = offsetof(struct xt_hashlimit_mtinfo3, hinfo), .checkentry = hashlimit_mt_check, .destroy = hashlimit_mt_destroy, .me = THIS_MODULE, }, #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) { .name = "hashlimit", .revision = 1, .family = NFPROTO_IPV6, .match = hashlimit_mt_v1, .matchsize = sizeof(struct xt_hashlimit_mtinfo1), .usersize = offsetof(struct xt_hashlimit_mtinfo1, hinfo), .checkentry = hashlimit_mt_check_v1, .destroy = hashlimit_mt_destroy_v1, .me = THIS_MODULE, }, { .name = "hashlimit", .revision = 2, .family = NFPROTO_IPV6, .match = hashlimit_mt_v2, .matchsize = sizeof(struct xt_hashlimit_mtinfo2), .usersize = offsetof(struct xt_hashlimit_mtinfo2, hinfo), .checkentry = hashlimit_mt_check_v2, .destroy = hashlimit_mt_destroy_v2, .me = THIS_MODULE, }, { .name = "hashlimit", .revision = 3, .family = NFPROTO_IPV6, .match = hashlimit_mt, .matchsize = sizeof(struct xt_hashlimit_mtinfo3), .usersize = offsetof(struct xt_hashlimit_mtinfo3, hinfo), .checkentry = hashlimit_mt_check, .destroy = hashlimit_mt_destroy, .me = THIS_MODULE, }, #endif }; /* PROC stuff */ static void *dl_seq_start(struct seq_file *s, loff_t *pos) __acquires(htable->lock) { struct xt_hashlimit_htable *htable = pde_data(file_inode(s->file)); unsigned int *bucket; spin_lock_bh(&htable->lock); if (*pos >= htable->cfg.size) return NULL; bucket = kmalloc(sizeof(unsigned int), GFP_ATOMIC); if (!bucket) return ERR_PTR(-ENOMEM); *bucket = *pos; return bucket; } static void *dl_seq_next(struct seq_file *s, void *v, loff_t *pos) { struct xt_hashlimit_htable *htable = pde_data(file_inode(s->file)); unsigned int *bucket = v; *pos = ++(*bucket); if (*pos >= htable->cfg.size) { kfree(v); return NULL; } return bucket; } static void dl_seq_stop(struct seq_file *s, void *v) __releases(htable->lock) { struct xt_hashlimit_htable *htable = pde_data(file_inode(s->file)); unsigned int *bucket = v; if (!IS_ERR(bucket)) kfree(bucket); spin_unlock_bh(&htable->lock); } static void dl_seq_print(struct dsthash_ent *ent, u_int8_t family, struct seq_file *s) { switch (family) { case NFPROTO_IPV4: seq_printf(s, "%ld %pI4:%u->%pI4:%u %llu %llu %llu\n", (long)(ent->expires - jiffies)/HZ, &ent->dst.ip.src, ntohs(ent->dst.src_port), &ent->dst.ip.dst, ntohs(ent->dst.dst_port), ent->rateinfo.credit, ent->rateinfo.credit_cap, ent->rateinfo.cost); break; #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) case NFPROTO_IPV6: seq_printf(s, "%ld %pI6:%u->%pI6:%u %llu %llu %llu\n", (long)(ent->expires - jiffies)/HZ, &ent->dst.ip6.src, ntohs(ent->dst.src_port), &ent->dst.ip6.dst, ntohs(ent->dst.dst_port), ent->rateinfo.credit, ent->rateinfo.credit_cap, ent->rateinfo.cost); break; #endif default: BUG(); } } static int dl_seq_real_show_v2(struct dsthash_ent *ent, u_int8_t family, struct seq_file *s) { struct xt_hashlimit_htable *ht = pde_data(file_inode(s->file)); spin_lock(&ent->lock); /* recalculate to show accurate numbers */ rateinfo_recalc(ent, jiffies, ht->cfg.mode, 2); dl_seq_print(ent, family, s); spin_unlock(&ent->lock); return seq_has_overflowed(s); } static int dl_seq_real_show_v1(struct dsthash_ent *ent, u_int8_t family, struct seq_file *s) { struct xt_hashlimit_htable *ht = pde_data(file_inode(s->file)); spin_lock(&ent->lock); /* recalculate to show accurate numbers */ rateinfo_recalc(ent, jiffies, ht->cfg.mode, 1); dl_seq_print(ent, family, s); spin_unlock(&ent->lock); return seq_has_overflowed(s); } static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family, struct seq_file *s) { struct xt_hashlimit_htable *ht = pde_data(file_inode(s->file)); spin_lock(&ent->lock); /* recalculate to show accurate numbers */ rateinfo_recalc(ent, jiffies, ht->cfg.mode, 3); dl_seq_print(ent, family, s); spin_unlock(&ent->lock); return seq_has_overflowed(s); } static int dl_seq_show_v2(struct seq_file *s, void *v) { struct xt_hashlimit_htable *htable = pde_data(file_inode(s->file)); unsigned int *bucket = (unsigned int *)v; struct dsthash_ent *ent; if (!hlist_empty(&htable->hash[*bucket])) { hlist_for_each_entry(ent, &htable->hash[*bucket], node) if (dl_seq_real_show_v2(ent, htable->family, s)) return -1; } return 0; } static int dl_seq_show_v1(struct seq_file *s, void *v) { struct xt_hashlimit_htable *htable = pde_data(file_inode(s->file)); unsigned int *bucket = v; struct dsthash_ent *ent; if (!hlist_empty(&htable->hash[*bucket])) { hlist_for_each_entry(ent, &htable->hash[*bucket], node) if (dl_seq_real_show_v1(ent, htable->family, s)) return -1; } return 0; } static int dl_seq_show(struct seq_file *s, void *v) { struct xt_hashlimit_htable *htable = pde_data(file_inode(s->file)); unsigned int *bucket = v; struct dsthash_ent *ent; if (!hlist_empty(&htable->hash[*bucket])) { hlist_for_each_entry(ent, &htable->hash[*bucket], node) if (dl_seq_real_show(ent, htable->family, s)) return -1; } return 0; } static const struct seq_operations dl_seq_ops_v1 = { .start = dl_seq_start, .next = dl_seq_next, .stop = dl_seq_stop, .show = dl_seq_show_v1 }; static const struct seq_operations dl_seq_ops_v2 = { .start = dl_seq_start, .next = dl_seq_next, .stop = dl_seq_stop, .show = dl_seq_show_v2 }; static const struct seq_operations dl_seq_ops = { .start = dl_seq_start, .next = dl_seq_next, .stop = dl_seq_stop, .show = dl_seq_show }; static int __net_init hashlimit_proc_net_init(struct net *net) { struct hashlimit_net *hashlimit_net = hashlimit_pernet(net); hashlimit_net->ipt_hashlimit = proc_mkdir("ipt_hashlimit", net->proc_net); if (!hashlimit_net->ipt_hashlimit) return -ENOMEM; #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) hashlimit_net->ip6t_hashlimit = proc_mkdir("ip6t_hashlimit", net->proc_net); if (!hashlimit_net->ip6t_hashlimit) { remove_proc_entry("ipt_hashlimit", net->proc_net); return -ENOMEM; } #endif return 0; } static void __net_exit hashlimit_proc_net_exit(struct net *net) { struct xt_hashlimit_htable *hinfo; struct hashlimit_net *hashlimit_net = hashlimit_pernet(net); /* hashlimit_net_exit() is called before hashlimit_mt_destroy(). * Make sure that the parent ipt_hashlimit and ip6t_hashlimit proc * entries is empty before trying to remove it. */ mutex_lock(&hashlimit_mutex); hlist_for_each_entry(hinfo, &hashlimit_net->htables, node) htable_remove_proc_entry(hinfo); hashlimit_net->ipt_hashlimit = NULL; hashlimit_net->ip6t_hashlimit = NULL; mutex_unlock(&hashlimit_mutex); remove_proc_entry("ipt_hashlimit", net->proc_net); #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) remove_proc_entry("ip6t_hashlimit", net->proc_net); #endif } static int __net_init hashlimit_net_init(struct net *net) { struct hashlimit_net *hashlimit_net = hashlimit_pernet(net); INIT_HLIST_HEAD(&hashlimit_net->htables); return hashlimit_proc_net_init(net); } static void __net_exit hashlimit_net_exit(struct net *net) { hashlimit_proc_net_exit(net); } static struct pernet_operations hashlimit_net_ops = { .init = hashlimit_net_init, .exit = hashlimit_net_exit, .id = &hashlimit_net_id, .size = sizeof(struct hashlimit_net), }; static int __init hashlimit_mt_init(void) { int err; err = register_pernet_subsys(&hashlimit_net_ops); if (err < 0) return err; err = xt_register_matches(hashlimit_mt_reg, ARRAY_SIZE(hashlimit_mt_reg)); if (err < 0) goto err1; err = -ENOMEM; hashlimit_cachep = kmem_cache_create("xt_hashlimit", sizeof(struct dsthash_ent), 0, 0, NULL); if (!hashlimit_cachep) { pr_warn("unable to create slab cache\n"); goto err2; } return 0; err2: xt_unregister_matches(hashlimit_mt_reg, ARRAY_SIZE(hashlimit_mt_reg)); err1: unregister_pernet_subsys(&hashlimit_net_ops); return err; } static void __exit hashlimit_mt_exit(void) { xt_unregister_matches(hashlimit_mt_reg, ARRAY_SIZE(hashlimit_mt_reg)); unregister_pernet_subsys(&hashlimit_net_ops); rcu_barrier(); kmem_cache_destroy(hashlimit_cachep); } module_init(hashlimit_mt_init); module_exit(hashlimit_mt_exit); |
3 579 647 424 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_TTY_H #define _LINUX_TTY_H #include <linux/fs.h> #include <linux/major.h> #include <linux/termios.h> #include <linux/workqueue.h> #include <linux/tty_buffer.h> #include <linux/tty_driver.h> #include <linux/tty_ldisc.h> #include <linux/tty_port.h> #include <linux/mutex.h> #include <linux/tty_flags.h> #include <uapi/linux/tty.h> #include <linux/rwsem.h> #include <linux/llist.h> #include <linux/android_kabi.h> /* * (Note: the *_driver.minor_start values 1, 64, 128, 192 are * hardcoded at present.) */ #define NR_UNIX98_PTY_DEFAULT 4096 /* Default maximum for Unix98 ptys */ #define NR_UNIX98_PTY_RESERVE 1024 /* Default reserve for main devpts */ #define NR_UNIX98_PTY_MAX (1 << MINORBITS) /* Absolute limit */ /* * This character is the same as _POSIX_VDISABLE: it cannot be used as * a c_cc[] character, but indicates that a particular special character * isn't in use (eg VINTR has no character etc) */ #define __DISABLED_CHAR '\0' #define INTR_CHAR(tty) ((tty)->termios.c_cc[VINTR]) #define QUIT_CHAR(tty) ((tty)->termios.c_cc[VQUIT]) #define ERASE_CHAR(tty) ((tty)->termios.c_cc[VERASE]) #define KILL_CHAR(tty) ((tty)->termios.c_cc[VKILL]) #define EOF_CHAR(tty) ((tty)->termios.c_cc[VEOF]) #define TIME_CHAR(tty) ((tty)->termios.c_cc[VTIME]) #define MIN_CHAR(tty) ((tty)->termios.c_cc[VMIN]) #define SWTC_CHAR(tty) ((tty)->termios.c_cc[VSWTC]) #define START_CHAR(tty) ((tty)->termios.c_cc[VSTART]) #define STOP_CHAR(tty) ((tty)->termios.c_cc[VSTOP]) #define SUSP_CHAR(tty) ((tty)->termios.c_cc[VSUSP]) #define EOL_CHAR(tty) ((tty)->termios.c_cc[VEOL]) #define REPRINT_CHAR(tty) ((tty)->termios.c_cc[VREPRINT]) #define DISCARD_CHAR(tty) ((tty)->termios.c_cc[VDISCARD]) #define WERASE_CHAR(tty) ((tty)->termios.c_cc[VWERASE]) #define LNEXT_CHAR(tty) ((tty)->termios.c_cc[VLNEXT]) #define EOL2_CHAR(tty) ((tty)->termios.c_cc[VEOL2]) #define _I_FLAG(tty, f) ((tty)->termios.c_iflag & (f)) #define _O_FLAG(tty, f) ((tty)->termios.c_oflag & (f)) #define _C_FLAG(tty, f) ((tty)->termios.c_cflag & (f)) #define _L_FLAG(tty, f) ((tty)->termios.c_lflag & (f)) #define I_IGNBRK(tty) _I_FLAG((tty), IGNBRK) #define I_BRKINT(tty) _I_FLAG((tty), BRKINT) #define I_IGNPAR(tty) _I_FLAG((tty), IGNPAR) #define I_PARMRK(tty) _I_FLAG((tty), PARMRK) #define I_INPCK(tty) _I_FLAG((tty), INPCK) #define I_ISTRIP(tty) _I_FLAG((tty), ISTRIP) #define I_INLCR(tty) _I_FLAG((tty), INLCR) #define I_IGNCR(tty) _I_FLAG((tty), IGNCR) #define I_ICRNL(tty) _I_FLAG((tty), ICRNL) #define I_IUCLC(tty) _I_FLAG((tty), IUCLC) #define I_IXON(tty) _I_FLAG((tty), IXON) #define I_IXANY(tty) _I_FLAG((tty), IXANY) #define I_IXOFF(tty) _I_FLAG((tty), IXOFF) #define I_IMAXBEL(tty) _I_FLAG((tty), IMAXBEL) #define I_IUTF8(tty) _I_FLAG((tty), IUTF8) #define O_OPOST(tty) _O_FLAG((tty), OPOST) #define O_OLCUC(tty) _O_FLAG((tty), OLCUC) #define O_ONLCR(tty) _O_FLAG((tty), ONLCR) #define O_OCRNL(tty) _O_FLAG((tty), OCRNL) #define O_ONOCR(tty) _O_FLAG((tty), ONOCR) #define O_ONLRET(tty) _O_FLAG((tty), ONLRET) #define O_OFILL(tty) _O_FLAG((tty), OFILL) #define O_OFDEL(tty) _O_FLAG((tty), OFDEL) #define O_NLDLY(tty) _O_FLAG((tty), NLDLY) #define O_CRDLY(tty) _O_FLAG((tty), CRDLY) #define O_TABDLY(tty) _O_FLAG((tty), TABDLY) #define O_BSDLY(tty) _O_FLAG((tty), BSDLY) #define O_VTDLY(tty) _O_FLAG((tty), VTDLY) #define O_FFDLY(tty) _O_FLAG((tty), FFDLY) #define C_BAUD(tty) _C_FLAG((tty), CBAUD) #define C_CSIZE(tty) _C_FLAG((tty), CSIZE) #define C_CSTOPB(tty) _C_FLAG((tty), CSTOPB) #define C_CREAD(tty) _C_FLAG((tty), CREAD) #define C_PARENB(tty) _C_FLAG((tty), PARENB) #define C_PARODD(tty) _C_FLAG((tty), PARODD) #define C_HUPCL(tty) _C_FLAG((tty), HUPCL) #define C_CLOCAL(tty) _C_FLAG((tty), CLOCAL) #define C_CIBAUD(tty) _C_FLAG((tty), CIBAUD) #define C_CRTSCTS(tty) _C_FLAG((tty), CRTSCTS) #define C_CMSPAR(tty) _C_FLAG((tty), CMSPAR) #define L_ISIG(tty) _L_FLAG((tty), ISIG) #define L_ICANON(tty) _L_FLAG((tty), ICANON) #define L_XCASE(tty) _L_FLAG((tty), XCASE) #define L_ECHO(tty) _L_FLAG((tty), ECHO) #define L_ECHOE(tty) _L_FLAG((tty), ECHOE) #define L_ECHOK(tty) _L_FLAG((tty), ECHOK) #define L_ECHONL(tty) _L_FLAG((tty), ECHONL) #define L_NOFLSH(tty) _L_FLAG((tty), NOFLSH) #define L_TOSTOP(tty) _L_FLAG((tty), TOSTOP) #define L_ECHOCTL(tty) _L_FLAG((tty), ECHOCTL) #define L_ECHOPRT(tty) _L_FLAG((tty), ECHOPRT) #define L_ECHOKE(tty) _L_FLAG((tty), ECHOKE) #define L_FLUSHO(tty) _L_FLAG((tty), FLUSHO) #define L_PENDIN(tty) _L_FLAG((tty), PENDIN) #define L_IEXTEN(tty) _L_FLAG((tty), IEXTEN) #define L_EXTPROC(tty) _L_FLAG((tty), EXTPROC) struct device; struct signal_struct; struct tty_operations; /** * struct tty_struct - state associated with a tty while open * * @kref: reference counting by tty_kref_get() and tty_kref_put(), reaching zero * frees the structure * @dev: class device or %NULL (e.g. ptys, serdev) * @driver: &struct tty_driver operating this tty * @ops: &struct tty_operations of @driver for this tty (open, close, etc.) * @index: index of this tty (e.g. to construct @name like tty12) * @ldisc_sem: protects line discipline changes (@ldisc) -- lock tty not pty * @ldisc: the current line discipline for this tty (n_tty by default) * @atomic_write_lock: protects against concurrent writers, i.e. locks * @write_cnt, @write_buf and similar * @legacy_mutex: leftover from history (BKL -> BTM -> @legacy_mutex), * protecting several operations on this tty * @throttle_mutex: protects against concurrent tty_throttle_safe() and * tty_unthrottle_safe() (but not tty_unthrottle()) * @termios_rwsem: protects @termios and @termios_locked * @winsize_mutex: protects @winsize * @termios: termios for the current tty, copied from/to @driver.termios * @termios_locked: locked termios (by %TIOCGLCKTRMIOS and %TIOCSLCKTRMIOS * ioctls) * @name: name of the tty constructed by tty_line_name() (e.g. ttyS3) * @flags: bitwise OR of %TTY_THROTTLED, %TTY_IO_ERROR, ... * @count: count of open processes, reaching zero cancels all the work for * this tty and drops a @kref too (but does not free this tty) * @winsize: size of the terminal "window" (cf. @winsize_mutex) * @flow: flow settings grouped together, see also @flow.unused * @flow.lock: lock for @flow members * @flow.stopped: tty stopped/started by stop_tty()/start_tty() * @flow.tco_stopped: tty stopped/started by %TCOOFF/%TCOON ioctls (it has * precedence over @flow.stopped) * @flow.unused: alignment for Alpha, so that no members other than @flow.* are * modified by the same 64b word store. The @flow's __aligned is * there for the very same reason. * @ctrl: control settings grouped together, see also @ctrl.unused * @ctrl.lock: lock for @ctrl members * @ctrl.pgrp: process group of this tty (setpgrp(2)) * @ctrl.session: session of this tty (setsid(2)). Writes are protected by both * @ctrl.lock and @legacy_mutex, readers must use at least one of * them. * @ctrl.pktstatus: packet mode status (bitwise OR of %TIOCPKT_ constants) * @ctrl.packet: packet mode enabled * @ctrl.unused: alignment for Alpha, see @flow.unused for explanation * @hw_stopped: not controlled by the tty layer, under @driver's control for CTS * handling * @receive_room: bytes permitted to feed to @ldisc without any being lost * @flow_change: controls behavior of throttling, see tty_throttle_safe() and * tty_unthrottle_safe() * @link: link to another pty (master -> slave and vice versa) * @fasync: state for %O_ASYNC (for %SIGIO); managed by fasync_helper() * @write_wait: concurrent writers are waiting in this queue until they are * allowed to write * @read_wait: readers wait for data in this queue * @hangup_work: normally a work to perform a hangup (do_tty_hangup()); while * freeing the tty, (re)used to release_one_tty() * @disc_data: pointer to @ldisc's private data (e.g. to &struct n_tty_data) * @driver_data: pointer to @driver's private data (e.g. &struct uart_state) * @files_lock: protects @tty_files list * @tty_files: list of (re)openers of this tty (i.e. linked &struct * tty_file_private) * @closing: when set during close, n_tty processes only START & STOP chars * @write_buf: temporary buffer used during tty_write() to copy user data to * @write_cnt: count of bytes written in tty_write() to @write_buf * @SAK_work: if the tty has a pending do_SAK, it is queued here * @port: persistent storage for this device (i.e. &struct tty_port) * * All of the state associated with a tty while the tty is open. Persistent * storage for tty devices is referenced here as @port and is documented in * &struct tty_port. */ struct tty_struct { struct kref kref; struct device *dev; struct tty_driver *driver; const struct tty_operations *ops; int index; struct ld_semaphore ldisc_sem; struct tty_ldisc *ldisc; struct mutex atomic_write_lock; struct mutex legacy_mutex; struct mutex throttle_mutex; struct rw_semaphore termios_rwsem; struct mutex winsize_mutex; struct ktermios termios, termios_locked; char name[64]; unsigned long flags; int count; struct winsize winsize; struct { spinlock_t lock; bool stopped; bool tco_stopped; unsigned long unused[0]; } __aligned(sizeof(unsigned long)) flow; struct { spinlock_t lock; struct pid *pgrp; struct pid *session; unsigned char pktstatus; bool packet; unsigned long unused[0]; } __aligned(sizeof(unsigned long)) ctrl; int hw_stopped; unsigned int receive_room; int flow_change; struct tty_struct *link; struct fasync_struct *fasync; wait_queue_head_t write_wait; wait_queue_head_t read_wait; struct work_struct hangup_work; void *disc_data; void *driver_data; spinlock_t files_lock; struct list_head tty_files; #define N_TTY_BUF_SIZE 4096 int closing; unsigned char *write_buf; int write_cnt; struct work_struct SAK_work; struct tty_port *port; ANDROID_KABI_RESERVE(1); ANDROID_KABI_RESERVE(2); } __randomize_layout; /* Each of a tty's open files has private_data pointing to tty_file_private */ struct tty_file_private { struct tty_struct *tty; struct file *file; struct list_head list; }; /** * DOC: TTY Struct Flags * * These bits are used in the :c:member:`tty_struct.flags` field. * * So that interrupts won't be able to mess up the queues, * copy_to_cooked must be atomic with respect to itself, as must * tty->write. Thus, you must use the inline functions set_bit() and * clear_bit() to make things atomic. * * TTY_THROTTLED * Driver input is throttled. The ldisc should call * :c:member:`tty_driver.unthrottle()` in order to resume reception when * it is ready to process more data (at threshold min). * * TTY_IO_ERROR * If set, causes all subsequent userspace read/write calls on the tty to * fail, returning -%EIO. (May be no ldisc too.) * * TTY_OTHER_CLOSED * Device is a pty and the other side has closed. * * TTY_EXCLUSIVE * Exclusive open mode (a single opener). * * TTY_DO_WRITE_WAKEUP * If set, causes the driver to call the * :c:member:`tty_ldisc_ops.write_wakeup()` method in order to resume * transmission when it can accept more data to transmit. * * TTY_LDISC_OPEN * Indicates that a line discipline is open. For debugging purposes only. * * TTY_PTY_LOCK * A flag private to pty code to implement %TIOCSPTLCK/%TIOCGPTLCK logic. * * TTY_NO_WRITE_SPLIT * Prevent driver from splitting up writes into smaller chunks (preserve * write boundaries to driver). * * TTY_HUPPED * The TTY was hung up. This is set post :c:member:`tty_driver.hangup()`. * * TTY_HUPPING * The TTY is in the process of hanging up to abort potential readers. * * TTY_LDISC_CHANGING * Line discipline for this TTY is being changed. I/O should not block * when this is set. Use tty_io_nonblock() to check. * * TTY_LDISC_HALTED * Line discipline for this TTY was stopped. No work should be queued to * this ldisc. */ #define TTY_THROTTLED 0 #define TTY_IO_ERROR 1 #define TTY_OTHER_CLOSED 2 #define TTY_EXCLUSIVE 3 #define TTY_DO_WRITE_WAKEUP 5 #define TTY_LDISC_OPEN 11 #define TTY_PTY_LOCK 16 #define TTY_NO_WRITE_SPLIT 17 #define TTY_HUPPED 18 #define TTY_HUPPING 19 #define TTY_LDISC_CHANGING 20 #define TTY_LDISC_HALTED 22 static inline bool tty_io_nonblock(struct tty_struct *tty, struct file *file) { return file->f_flags & O_NONBLOCK || test_bit(TTY_LDISC_CHANGING, &tty->flags); } static inline bool tty_io_error(struct tty_struct *tty) { return test_bit(TTY_IO_ERROR, &tty->flags); } static inline bool tty_throttled(struct tty_struct *tty) { return test_bit(TTY_THROTTLED, &tty->flags); } #ifdef CONFIG_TTY void tty_kref_put(struct tty_struct *tty); struct pid *tty_get_pgrp(struct tty_struct *tty); void tty_vhangup_self(void); void disassociate_ctty(int priv); dev_t tty_devnum(struct tty_struct *tty); void proc_clear_tty(struct task_struct *p); struct tty_struct *get_current_tty(void); /* tty_io.c */ int __init tty_init(void); const char *tty_name(const struct tty_struct *tty); struct tty_struct *tty_kopen_exclusive(dev_t device); struct tty_struct *tty_kopen_shared(dev_t device); void tty_kclose(struct tty_struct *tty); int tty_dev_name_to_number(const char *name, dev_t *number); #else static inline void tty_kref_put(struct tty_struct *tty) { } static inline struct pid *tty_get_pgrp(struct tty_struct *tty) { return NULL; } static inline void tty_vhangup_self(void) { } static inline void disassociate_ctty(int priv) { } static inline dev_t tty_devnum(struct tty_struct *tty) { return 0; } static inline void proc_clear_tty(struct task_struct *p) { } static inline struct tty_struct *get_current_tty(void) { return NULL; } /* tty_io.c */ static inline int __init tty_init(void) { return 0; } static inline const char *tty_name(const struct tty_struct *tty) { return "(none)"; } static inline struct tty_struct *tty_kopen_exclusive(dev_t device) { return ERR_PTR(-ENODEV); } static inline void tty_kclose(struct tty_struct *tty) { } static inline int tty_dev_name_to_number(const char *name, dev_t *number) { return -ENOTSUPP; } #endif extern struct ktermios tty_std_termios; int vcs_init(void); extern struct class *tty_class; /** * tty_kref_get - get a tty reference * @tty: tty device * * Return a new reference to a tty object. The caller must hold * sufficient locks/counts to ensure that their existing reference cannot * go away */ static inline struct tty_struct *tty_kref_get(struct tty_struct *tty) { if (tty) kref_get(&tty->kref); return tty; } const char *tty_driver_name(const struct tty_struct *tty); void tty_wait_until_sent(struct tty_struct *tty, long timeout); void stop_tty(struct tty_struct *tty); void start_tty(struct tty_struct *tty); void tty_write_message(struct tty_struct *tty, char *msg); int tty_send_xchar(struct tty_struct *tty, char ch); int tty_put_char(struct tty_struct *tty, unsigned char c); unsigned int tty_chars_in_buffer(struct tty_struct *tty); unsigned int tty_write_room(struct tty_struct *tty); void tty_driver_flush_buffer(struct tty_struct *tty); void tty_unthrottle(struct tty_struct *tty); int tty_throttle_safe(struct tty_struct *tty); int tty_unthrottle_safe(struct tty_struct *tty); int tty_do_resize(struct tty_struct *tty, struct winsize *ws); int tty_get_icount(struct tty_struct *tty, struct serial_icounter_struct *icount); int is_current_pgrp_orphaned(void); void tty_hangup(struct tty_struct *tty); void tty_vhangup(struct tty_struct *tty); int tty_hung_up_p(struct file *filp); void do_SAK(struct tty_struct *tty); void __do_SAK(struct tty_struct *tty); void no_tty(void); speed_t tty_termios_baud_rate(const struct ktermios *termios); void tty_termios_encode_baud_rate(struct ktermios *termios, speed_t ibaud, speed_t obaud); void tty_encode_baud_rate(struct tty_struct *tty, speed_t ibaud, speed_t obaud); /** * tty_get_baud_rate - get tty bit rates * @tty: tty to query * * Returns the baud rate as an integer for this terminal. The * termios lock must be held by the caller and the terminal bit * flags may be updated. * * Locking: none */ static inline speed_t tty_get_baud_rate(struct tty_struct *tty) { return tty_termios_baud_rate(&tty->termios); } unsigned char tty_get_char_size(unsigned int cflag); unsigned char tty_get_frame_size(unsigned int cflag); void tty_termios_copy_hw(struct ktermios *new, const struct ktermios *old); int tty_termios_hw_change(const struct ktermios *a, const struct ktermios *b); int tty_set_termios(struct tty_struct *tty, struct ktermios *kt); void tty_wakeup(struct tty_struct *tty); int tty_mode_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg); int tty_perform_flush(struct tty_struct *tty, unsigned long arg); struct tty_struct *tty_init_dev(struct tty_driver *driver, int idx); void tty_release_struct(struct tty_struct *tty, int idx); void tty_init_termios(struct tty_struct *tty); void tty_save_termios(struct tty_struct *tty); int tty_standard_install(struct tty_driver *driver, struct tty_struct *tty); extern struct mutex tty_mutex; /* n_tty.c */ void n_tty_inherit_ops(struct tty_ldisc_ops *ops); #ifdef CONFIG_TTY void __init n_tty_init(void); #else static inline void n_tty_init(void) { } #endif /* tty_audit.c */ #ifdef CONFIG_AUDIT void tty_audit_exit(void); void tty_audit_fork(struct signal_struct *sig); int tty_audit_push(void); #else static inline void tty_audit_exit(void) { } static inline void tty_audit_fork(struct signal_struct *sig) { } static inline int tty_audit_push(void) { return 0; } #endif /* tty_ioctl.c */ int n_tty_ioctl_helper(struct tty_struct *tty, unsigned int cmd, unsigned long arg); /* vt.c */ int vt_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg); long vt_compat_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg); /* tty_mutex.c */ /* functions for preparation of BKL removal */ void tty_lock(struct tty_struct *tty); int tty_lock_interruptible(struct tty_struct *tty); void tty_unlock(struct tty_struct *tty); void tty_lock_slave(struct tty_struct *tty); void tty_unlock_slave(struct tty_struct *tty); void tty_set_lock_subclass(struct tty_struct *tty); #endif |
35108 27945 26079 27925 24778 6206 10 24890 22955 586 1987 35820 3781 25947 28037 27925 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 | // SPDX-License-Identifier: GPL-2.0-only /* * This implements the various checks for CONFIG_HARDENED_USERCOPY*, * which are designed to protect kernel memory from needless exposure * and overwrite under many unintended conditions. This code is based * on PAX_USERCOPY, which is: * * Copyright (C) 2001-2016 PaX Team, Bradley Spengler, Open Source * Security Inc. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/mm.h> #include <linux/highmem.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/sched/task.h> #include <linux/sched/task_stack.h> #include <linux/thread_info.h> #include <linux/vmalloc.h> #include <linux/atomic.h> #include <linux/jump_label.h> #include <asm/sections.h> #include "slab.h" /* * Checks if a given pointer and length is contained by the current * stack frame (if possible). * * Returns: * NOT_STACK: not at all on the stack * GOOD_FRAME: fully within a valid stack frame * GOOD_STACK: within the current stack (when can't frame-check exactly) * BAD_STACK: error condition (invalid stack position or bad stack frame) */ static noinline int check_stack_object(const void *obj, unsigned long len) { const void * const stack = task_stack_page(current); const void * const stackend = stack + THREAD_SIZE; int ret; /* Object is not on the stack at all. */ if (obj + len <= stack || stackend <= obj) return NOT_STACK; /* * Reject: object partially overlaps the stack (passing the * check above means at least one end is within the stack, * so if this check fails, the other end is outside the stack). */ if (obj < stack || stackend < obj + len) return BAD_STACK; /* Check if object is safely within a valid frame. */ ret = arch_within_stack_frames(stack, stackend, obj, len); if (ret) return ret; /* Finally, check stack depth if possible. */ #ifdef CONFIG_ARCH_HAS_CURRENT_STACK_POINTER if (IS_ENABLED(CONFIG_STACK_GROWSUP)) { if ((void *)current_stack_pointer < obj + len) return BAD_STACK; } else { if (obj < (void *)current_stack_pointer) return BAD_STACK; } #endif return GOOD_STACK; } /* * If these functions are reached, then CONFIG_HARDENED_USERCOPY has found * an unexpected state during a copy_from_user() or copy_to_user() call. * There are several checks being performed on the buffer by the * __check_object_size() function. Normal stack buffer usage should never * trip the checks, and kernel text addressing will always trip the check. * For cache objects, it is checking that only the whitelisted range of * bytes for a given cache is being accessed (via the cache's usersize and * useroffset fields). To adjust a cache whitelist, use the usercopy-aware * kmem_cache_create_usercopy() function to create the cache (and * carefully audit the whitelist range). */ void __noreturn usercopy_abort(const char *name, const char *detail, bool to_user, unsigned long offset, unsigned long len) { pr_emerg("Kernel memory %s attempt detected %s %s%s%s%s (offset %lu, size %lu)!\n", to_user ? "exposure" : "overwrite", to_user ? "from" : "to", name ? : "unknown?!", detail ? " '" : "", detail ? : "", detail ? "'" : "", offset, len); /* * For greater effect, it would be nice to do do_group_exit(), * but BUG() actually hooks all the lock-breaking and per-arch * Oops code, so that is used here instead. */ BUG(); } /* Returns true if any portion of [ptr,ptr+n) over laps with [low,high). */ static bool overlaps(const unsigned long ptr, unsigned long n, unsigned long low, unsigned long high) { const unsigned long check_low = ptr; unsigned long check_high = check_low + n; /* Does not overlap if entirely above or entirely below. */ if (check_low >= high || check_high <= low) return false; return true; } /* Is this address range in the kernel text area? */ static inline void check_kernel_text_object(const unsigned long ptr, unsigned long n, bool to_user) { unsigned long textlow = (unsigned long)_stext; unsigned long texthigh = (unsigned long)_etext; unsigned long textlow_linear, texthigh_linear; if (overlaps(ptr, n, textlow, texthigh)) usercopy_abort("kernel text", NULL, to_user, ptr - textlow, n); /* * Some architectures have virtual memory mappings with a secondary * mapping of the kernel text, i.e. there is more than one virtual * kernel address that points to the kernel image. It is usually * when there is a separate linear physical memory mapping, in that * __pa() is not just the reverse of __va(). This can be detected * and checked: */ textlow_linear = (unsigned long)lm_alias(textlow); /* No different mapping: we're done. */ if (textlow_linear == textlow) return; /* Check the secondary mapping... */ texthigh_linear = (unsigned long)lm_alias(texthigh); if (overlaps(ptr, n, textlow_linear, texthigh_linear)) usercopy_abort("linear kernel text", NULL, to_user, ptr - textlow_linear, n); } static inline void check_bogus_address(const unsigned long ptr, unsigned long n, bool to_user) { /* Reject if object wraps past end of memory. */ if (ptr + (n - 1) < ptr) usercopy_abort("wrapped address", NULL, to_user, 0, ptr + n); /* Reject if NULL or ZERO-allocation. */ if (ZERO_OR_NULL_PTR(ptr)) usercopy_abort("null address", NULL, to_user, ptr, n); } static inline void check_heap_object(const void *ptr, unsigned long n, bool to_user) { unsigned long addr = (unsigned long)ptr; unsigned long offset; struct folio *folio; if (is_kmap_addr(ptr)) { offset = offset_in_page(ptr); if (n > PAGE_SIZE - offset) usercopy_abort("kmap", NULL, to_user, offset, n); return; } if (is_vmalloc_addr(ptr) && !pagefault_disabled()) { struct vmap_area *area = find_vmap_area(addr); if (!area) usercopy_abort("vmalloc", "no area", to_user, 0, n); if (n > area->va_end - addr) { offset = addr - area->va_start; usercopy_abort("vmalloc", NULL, to_user, offset, n); } return; } if (!virt_addr_valid(ptr)) return; folio = virt_to_folio(ptr); if (folio_test_slab(folio)) { /* Check slab allocator for flags and size. */ __check_heap_object(ptr, n, folio_slab(folio), to_user); } else if (folio_test_large(folio)) { offset = ptr - folio_address(folio); if (n > folio_size(folio) - offset) usercopy_abort("page alloc", NULL, to_user, offset, n); } } static DEFINE_STATIC_KEY_FALSE_RO(bypass_usercopy_checks); /* * Validates that the given object is: * - not bogus address * - fully contained by stack (or stack frame, when available) * - fully within SLAB object (or object whitelist area, when available) * - not in kernel text */ void __check_object_size(const void *ptr, unsigned long n, bool to_user) { if (static_branch_unlikely(&bypass_usercopy_checks)) return; /* Skip all tests if size is zero. */ if (!n) return; /* Check for invalid addresses. */ check_bogus_address((const unsigned long)ptr, n, to_user); /* Check for bad stack object. */ switch (check_stack_object(ptr, n)) { case NOT_STACK: /* Object is not touching the current process stack. */ break; case GOOD_FRAME: case GOOD_STACK: /* * Object is either in the correct frame (when it * is possible to check) or just generally on the * process stack (when frame checking not available). */ return; default: usercopy_abort("process stack", NULL, to_user, #ifdef CONFIG_ARCH_HAS_CURRENT_STACK_POINTER IS_ENABLED(CONFIG_STACK_GROWSUP) ? ptr - (void *)current_stack_pointer : (void *)current_stack_pointer - ptr, #else 0, #endif n); } /* Check for bad heap object. */ check_heap_object(ptr, n, to_user); /* Check for object in kernel to avoid text exposure. */ check_kernel_text_object((const unsigned long)ptr, n, to_user); } EXPORT_SYMBOL(__check_object_size); static bool enable_checks __initdata = true; static int __init parse_hardened_usercopy(char *str) { if (strtobool(str, &enable_checks)) pr_warn("Invalid option string for hardened_usercopy: '%s'\n", str); return 1; } __setup("hardened_usercopy=", parse_hardened_usercopy); static int __init set_hardened_usercopy(void) { if (enable_checks == false) static_branch_enable(&bypass_usercopy_checks); return 1; } late_initcall(set_hardened_usercopy); |
6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 | // SPDX-License-Identifier: GPL-2.0 #include <linux/cpufreq.h> #include <linux/fs.h> #include <linux/init.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> extern const struct seq_operations cpuinfo_op; static int cpuinfo_open(struct inode *inode, struct file *file) { return seq_open(file, &cpuinfo_op); } static const struct proc_ops cpuinfo_proc_ops = { .proc_flags = PROC_ENTRY_PERMANENT, .proc_open = cpuinfo_open, .proc_read_iter = seq_read_iter, .proc_lseek = seq_lseek, .proc_release = seq_release, }; static int __init proc_cpuinfo_init(void) { proc_create("cpuinfo", 0, NULL, &cpuinfo_proc_ops); return 0; } fs_initcall(proc_cpuinfo_init); |
1 1 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 | // SPDX-License-Identifier: GPL-2.0-or-later /* * CTR: Counter mode * * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com> */ #include <crypto/algapi.h> #include <crypto/ctr.h> #include <crypto/internal/cipher.h> #include <crypto/internal/skcipher.h> #include <linux/err.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/slab.h> struct crypto_rfc3686_ctx { struct crypto_skcipher *child; u8 nonce[CTR_RFC3686_NONCE_SIZE]; }; struct crypto_rfc3686_req_ctx { u8 iv[CTR_RFC3686_BLOCK_SIZE]; struct skcipher_request subreq CRYPTO_MINALIGN_ATTR; }; static void crypto_ctr_crypt_final(struct skcipher_walk *walk, struct crypto_cipher *tfm) { unsigned int bsize = crypto_cipher_blocksize(tfm); unsigned long alignmask = crypto_cipher_alignmask(tfm); u8 *ctrblk = walk->iv; u8 tmp[MAX_CIPHER_BLOCKSIZE + MAX_CIPHER_ALIGNMASK]; u8 *keystream = PTR_ALIGN(tmp + 0, alignmask + 1); u8 *src = walk->src.virt.addr; u8 *dst = walk->dst.virt.addr; unsigned int nbytes = walk->nbytes; crypto_cipher_encrypt_one(tfm, keystream, ctrblk); crypto_xor_cpy(dst, keystream, src, nbytes); crypto_inc(ctrblk, bsize); } static int crypto_ctr_crypt_segment(struct skcipher_walk *walk, struct crypto_cipher *tfm) { void (*fn)(struct crypto_tfm *, u8 *, const u8 *) = crypto_cipher_alg(tfm)->cia_encrypt; unsigned int bsize = crypto_cipher_blocksize(tfm); u8 *ctrblk = walk->iv; u8 *src = walk->src.virt.addr; u8 *dst = walk->dst.virt.addr; unsigned int nbytes = walk->nbytes; do { /* create keystream */ fn(crypto_cipher_tfm(tfm), dst, ctrblk); crypto_xor(dst, src, bsize); /* increment counter in counterblock */ crypto_inc(ctrblk, bsize); src += bsize; dst += bsize; } while ((nbytes -= bsize) >= bsize); return nbytes; } static int crypto_ctr_crypt_inplace(struct skcipher_walk *walk, struct crypto_cipher *tfm) { void (*fn)(struct crypto_tfm *, u8 *, const u8 *) = crypto_cipher_alg(tfm)->cia_encrypt; unsigned int bsize = crypto_cipher_blocksize(tfm); unsigned long alignmask = crypto_cipher_alignmask(tfm); unsigned int nbytes = walk->nbytes; u8 *ctrblk = walk->iv; u8 *src = walk->src.virt.addr; u8 tmp[MAX_CIPHER_BLOCKSIZE + MAX_CIPHER_ALIGNMASK]; u8 *keystream = PTR_ALIGN(tmp + 0, alignmask + 1); do { /* create keystream */ fn(crypto_cipher_tfm(tfm), keystream, ctrblk); crypto_xor(src, keystream, bsize); /* increment counter in counterblock */ crypto_inc(ctrblk, bsize); src += bsize; } while ((nbytes -= bsize) >= bsize); return nbytes; } static int crypto_ctr_crypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct crypto_cipher *cipher = skcipher_cipher_simple(tfm); const unsigned int bsize = crypto_cipher_blocksize(cipher); struct skcipher_walk walk; unsigned int nbytes; int err; err = skcipher_walk_virt(&walk, req, false); while (walk.nbytes >= bsize) { if (walk.src.virt.addr == walk.dst.virt.addr) nbytes = crypto_ctr_crypt_inplace(&walk, cipher); else nbytes = crypto_ctr_crypt_segment(&walk, cipher); err = skcipher_walk_done(&walk, nbytes); } if (walk.nbytes) { crypto_ctr_crypt_final(&walk, cipher); err = skcipher_walk_done(&walk, 0); } return err; } static int crypto_ctr_create(struct crypto_template *tmpl, struct rtattr **tb) { struct skcipher_instance *inst; struct crypto_alg *alg; int err; inst = skcipher_alloc_instance_simple(tmpl, tb); if (IS_ERR(inst)) return PTR_ERR(inst); alg = skcipher_ialg_simple(inst); /* Block size must be >= 4 bytes. */ err = -EINVAL; if (alg->cra_blocksize < 4) goto out_free_inst; /* If this is false we'd fail the alignment of crypto_inc. */ if (alg->cra_blocksize % 4) goto out_free_inst; /* CTR mode is a stream cipher. */ inst->alg.base.cra_blocksize = 1; /* * To simplify the implementation, configure the skcipher walk to only * give a partial block at the very end, never earlier. */ inst->alg.chunksize = alg->cra_blocksize; inst->alg.encrypt = crypto_ctr_crypt; inst->alg.decrypt = crypto_ctr_crypt; err = skcipher_register_instance(tmpl, inst); if (err) { out_free_inst: inst->free(inst); } return err; } static int crypto_rfc3686_setkey(struct crypto_skcipher *parent, const u8 *key, unsigned int keylen) { struct crypto_rfc3686_ctx *ctx = crypto_skcipher_ctx(parent); struct crypto_skcipher *child = ctx->child; /* the nonce is stored in bytes at end of key */ if (keylen < CTR_RFC3686_NONCE_SIZE) return -EINVAL; memcpy(ctx->nonce, key + (keylen - CTR_RFC3686_NONCE_SIZE), CTR_RFC3686_NONCE_SIZE); keylen -= CTR_RFC3686_NONCE_SIZE; crypto_skcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK); crypto_skcipher_set_flags(child, crypto_skcipher_get_flags(parent) & CRYPTO_TFM_REQ_MASK); return crypto_skcipher_setkey(child, key, keylen); } static int crypto_rfc3686_crypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct crypto_rfc3686_ctx *ctx = crypto_skcipher_ctx(tfm); struct crypto_skcipher *child = ctx->child; unsigned long align = crypto_skcipher_alignmask(tfm); struct crypto_rfc3686_req_ctx *rctx = (void *)PTR_ALIGN((u8 *)skcipher_request_ctx(req), align + 1); struct skcipher_request *subreq = &rctx->subreq; u8 *iv = rctx->iv; /* set up counter block */ memcpy(iv, ctx->nonce, CTR_RFC3686_NONCE_SIZE); memcpy(iv + CTR_RFC3686_NONCE_SIZE, req->iv, CTR_RFC3686_IV_SIZE); /* initialize counter portion of counter block */ *(__be32 *)(iv + CTR_RFC3686_NONCE_SIZE + CTR_RFC3686_IV_SIZE) = cpu_to_be32(1); skcipher_request_set_tfm(subreq, child); skcipher_request_set_callback(subreq, req->base.flags, req->base.complete, req->base.data); skcipher_request_set_crypt(subreq, req->src, req->dst, req->cryptlen, iv); return crypto_skcipher_encrypt(subreq); } static int crypto_rfc3686_init_tfm(struct crypto_skcipher *tfm) { struct skcipher_instance *inst = skcipher_alg_instance(tfm); struct crypto_skcipher_spawn *spawn = skcipher_instance_ctx(inst); struct crypto_rfc3686_ctx *ctx = crypto_skcipher_ctx(tfm); struct crypto_skcipher *cipher; unsigned long align; unsigned int reqsize; cipher = crypto_spawn_skcipher(spawn); if (IS_ERR(cipher)) return PTR_ERR(cipher); ctx->child = cipher; align = crypto_skcipher_alignmask(tfm); align &= ~(crypto_tfm_ctx_alignment() - 1); reqsize = align + sizeof(struct crypto_rfc3686_req_ctx) + crypto_skcipher_reqsize(cipher); crypto_skcipher_set_reqsize(tfm, reqsize); return 0; } static void crypto_rfc3686_exit_tfm(struct crypto_skcipher *tfm) { struct crypto_rfc3686_ctx *ctx = crypto_skcipher_ctx(tfm); crypto_free_skcipher(ctx->child); } static void crypto_rfc3686_free(struct skcipher_instance *inst) { struct crypto_skcipher_spawn *spawn = skcipher_instance_ctx(inst); crypto_drop_skcipher(spawn); kfree(inst); } static int crypto_rfc3686_create(struct crypto_template *tmpl, struct rtattr **tb) { struct skcipher_instance *inst; struct skcipher_alg *alg; struct crypto_skcipher_spawn *spawn; u32 mask; int err; err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_SKCIPHER, &mask); if (err) return err; inst = kzalloc(sizeof(*inst) + sizeof(*spawn), GFP_KERNEL); if (!inst) return -ENOMEM; spawn = skcipher_instance_ctx(inst); err = crypto_grab_skcipher(spawn, skcipher_crypto_instance(inst), crypto_attr_alg_name(tb[1]), 0, mask); if (err) goto err_free_inst; alg = crypto_spawn_skcipher_alg(spawn); /* We only support 16-byte blocks. */ err = -EINVAL; if (crypto_skcipher_alg_ivsize(alg) != CTR_RFC3686_BLOCK_SIZE) goto err_free_inst; /* Not a stream cipher? */ if (alg->base.cra_blocksize != 1) goto err_free_inst; err = -ENAMETOOLONG; if (snprintf(inst->alg.base.cra_name, CRYPTO_MAX_ALG_NAME, "rfc3686(%s)", alg->base.cra_name) >= CRYPTO_MAX_ALG_NAME) goto err_free_inst; if (snprintf(inst->alg.base.cra_driver_name, CRYPTO_MAX_ALG_NAME, "rfc3686(%s)", alg->base.cra_driver_name) >= CRYPTO_MAX_ALG_NAME) goto err_free_inst; inst->alg.base.cra_priority = alg->base.cra_priority; inst->alg.base.cra_blocksize = 1; inst->alg.base.cra_alignmask = alg->base.cra_alignmask; inst->alg.ivsize = CTR_RFC3686_IV_SIZE; inst->alg.chunksize = crypto_skcipher_alg_chunksize(alg); inst->alg.min_keysize = crypto_skcipher_alg_min_keysize(alg) + CTR_RFC3686_NONCE_SIZE; inst->alg.max_keysize = crypto_skcipher_alg_max_keysize(alg) + CTR_RFC3686_NONCE_SIZE; inst->alg.setkey = crypto_rfc3686_setkey; inst->alg.encrypt = crypto_rfc3686_crypt; inst->alg.decrypt = crypto_rfc3686_crypt; inst->alg.base.cra_ctxsize = sizeof(struct crypto_rfc3686_ctx); inst->alg.init = crypto_rfc3686_init_tfm; inst->alg.exit = crypto_rfc3686_exit_tfm; inst->free = crypto_rfc3686_free; err = skcipher_register_instance(tmpl, inst); if (err) { err_free_inst: crypto_rfc3686_free(inst); } return err; } static struct crypto_template crypto_ctr_tmpls[] = { { .name = "ctr", .create = crypto_ctr_create, .module = THIS_MODULE, }, { .name = "rfc3686", .create = crypto_rfc3686_create, .module = THIS_MODULE, }, }; static int __init crypto_ctr_module_init(void) { return crypto_register_templates(crypto_ctr_tmpls, ARRAY_SIZE(crypto_ctr_tmpls)); } static void __exit crypto_ctr_module_exit(void) { crypto_unregister_templates(crypto_ctr_tmpls, ARRAY_SIZE(crypto_ctr_tmpls)); } subsys_initcall(crypto_ctr_module_init); module_exit(crypto_ctr_module_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("CTR block cipher mode of operation"); MODULE_ALIAS_CRYPTO("rfc3686"); MODULE_ALIAS_CRYPTO("ctr"); MODULE_IMPORT_NS(CRYPTO_INTERNAL); |
35 46 36 36 36 36 113 113 414 411 4 411 4 422 415 10 37 414 409 6 403 408 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 | // SPDX-License-Identifier: GPL-2.0 /* * Functions related to setting various queue properties from drivers */ #define pr_fmt(fmt) "%s: " fmt, __func__ #include <linux/kernel.h> #include <linux/module.h> #include <linux/init.h> #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/pagemap.h> #include <linux/backing-dev-defs.h> #include <linux/gcd.h> #include <linux/lcm.h> #include <linux/jiffies.h> #include <linux/gfp.h> #include <linux/dma-mapping.h> #include "blk.h" #include "blk-wbt.h" /* Protects blk_nr_sub_page_limit_queues and blk_sub_page_limits changes. */ static DEFINE_MUTEX(blk_sub_page_limit_lock); static uint32_t blk_nr_sub_page_limit_queues; DEFINE_STATIC_KEY_FALSE(blk_sub_page_limits); void blk_queue_rq_timeout(struct request_queue *q, unsigned int timeout) { q->rq_timeout = timeout; } EXPORT_SYMBOL_GPL(blk_queue_rq_timeout); /** * blk_set_default_limits - reset limits to default values * @lim: the queue_limits structure to reset * * Description: * Returns a queue_limit struct to its default state. */ void blk_set_default_limits(struct queue_limits *lim) { lim->max_segments = BLK_MAX_SEGMENTS; lim->max_discard_segments = 1; lim->max_integrity_segments = 0; lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; lim->virt_boundary_mask = 0; lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; lim->max_sectors = lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS; lim->max_dev_sectors = 0; lim->chunk_sectors = 0; lim->max_write_zeroes_sectors = 0; lim->max_zone_append_sectors = 0; lim->max_discard_sectors = 0; lim->max_hw_discard_sectors = 0; lim->max_secure_erase_sectors = 0; lim->discard_granularity = 0; lim->discard_alignment = 0; lim->discard_misaligned = 0; lim->logical_block_size = lim->physical_block_size = lim->io_min = 512; lim->bounce = BLK_BOUNCE_NONE; lim->alignment_offset = 0; lim->io_opt = 0; lim->misaligned = 0; lim->zoned = BLK_ZONED_NONE; lim->zone_write_granularity = 0; lim->dma_alignment = 511; lim->sub_page_limits = false; } /** * blk_set_stacking_limits - set default limits for stacking devices * @lim: the queue_limits structure to reset * * Description: * Returns a queue_limit struct to its default state. Should be used * by stacking drivers like DM that have no internal limits. */ void blk_set_stacking_limits(struct queue_limits *lim) { blk_set_default_limits(lim); /* Inherit limits from component devices */ lim->max_segments = USHRT_MAX; lim->max_discard_segments = USHRT_MAX; lim->max_hw_sectors = UINT_MAX; lim->max_segment_size = UINT_MAX; lim->max_sectors = UINT_MAX; lim->max_dev_sectors = UINT_MAX; lim->max_write_zeroes_sectors = UINT_MAX; lim->max_zone_append_sectors = UINT_MAX; } EXPORT_SYMBOL(blk_set_stacking_limits); /** * blk_queue_bounce_limit - set bounce buffer limit for queue * @q: the request queue for the device * @bounce: bounce limit to enforce * * Description: * Force bouncing for ISA DMA ranges or highmem. * * DEPRECATED, don't use in new code. **/ void blk_queue_bounce_limit(struct request_queue *q, enum blk_bounce bounce) { q->limits.bounce = bounce; } EXPORT_SYMBOL(blk_queue_bounce_limit); /* For debugfs. */ int blk_sub_page_limit_queues_get(void *data, u64 *val) { *val = READ_ONCE(blk_nr_sub_page_limit_queues); return 0; } /** * blk_enable_sub_page_limits - enable support for limits below the page size * @lim: request queue limits for which to enable support of these features. * * Enable support for max_segment_size values smaller than PAGE_SIZE and for * max_hw_sectors values below PAGE_SIZE >> SECTOR_SHIFT. Support for these * features is not enabled all the time because of the runtime overhead of these * features. */ static void blk_enable_sub_page_limits(struct queue_limits *lim) { if (lim->sub_page_limits) return; lim->sub_page_limits = true; mutex_lock(&blk_sub_page_limit_lock); if (++blk_nr_sub_page_limit_queues == 1) static_branch_enable(&blk_sub_page_limits); mutex_unlock(&blk_sub_page_limit_lock); } /** * blk_disable_sub_page_limits - disable support for limits below the page size * @lim: request queue limits for which to enable support of these features. * * max_segment_size values smaller than PAGE_SIZE and for max_hw_sectors values * below PAGE_SIZE >> SECTOR_SHIFT. Support for these features is not enabled * all the time because of the runtime overhead of these features. */ void blk_disable_sub_page_limits(struct queue_limits *lim) { if (!lim->sub_page_limits) return; lim->sub_page_limits = false; mutex_lock(&blk_sub_page_limit_lock); WARN_ON_ONCE(blk_nr_sub_page_limit_queues <= 0); if (--blk_nr_sub_page_limit_queues == 0) static_branch_disable(&blk_sub_page_limits); mutex_unlock(&blk_sub_page_limit_lock); } /** * blk_queue_max_hw_sectors - set max sectors for a request for this queue * @q: the request queue for the device * @max_hw_sectors: max hardware sectors in the usual 512b unit * * Description: * Enables a low level driver to set a hard upper limit, * max_hw_sectors, on the size of requests. max_hw_sectors is set by * the device driver based upon the capabilities of the I/O * controller. * * max_dev_sectors is a hard limit imposed by the storage device for * READ/WRITE requests. It is set by the disk driver. * * max_sectors is a soft limit imposed by the block layer for * filesystem type requests. This value can be overridden on a * per-device basis in /sys/block/<device>/queue/max_sectors_kb. * The soft limit can not exceed max_hw_sectors. **/ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_sectors) { struct queue_limits *limits = &q->limits; unsigned int min_max_hw_sectors = PAGE_SIZE >> SECTOR_SHIFT; unsigned int max_sectors; if (max_hw_sectors < min_max_hw_sectors) { blk_enable_sub_page_limits(limits); min_max_hw_sectors = 1; } if (max_hw_sectors < min_max_hw_sectors) { max_hw_sectors = min_max_hw_sectors; pr_info("set to minimum %u\n", max_hw_sectors); } max_hw_sectors = round_down(max_hw_sectors, limits->logical_block_size >> SECTOR_SHIFT); limits->max_hw_sectors = max_hw_sectors; max_sectors = min_not_zero(max_hw_sectors, limits->max_dev_sectors); max_sectors = min(max_sectors, BLK_DEF_MAX_SECTORS); max_sectors = round_down(max_sectors, limits->logical_block_size >> SECTOR_SHIFT); limits->max_sectors = max_sectors; if (!q->disk) return; q->disk->bdi->io_pages = max_sectors >> (PAGE_SHIFT - 9); } EXPORT_SYMBOL(blk_queue_max_hw_sectors); /** * blk_queue_chunk_sectors - set size of the chunk for this queue * @q: the request queue for the device * @chunk_sectors: chunk sectors in the usual 512b unit * * Description: * If a driver doesn't want IOs to cross a given chunk size, it can set * this limit and prevent merging across chunks. Note that the block layer * must accept a page worth of data at any offset. So if the crossing of * chunks is a hard limitation in the driver, it must still be prepared * to split single page bios. **/ void blk_queue_chunk_sectors(struct request_queue *q, unsigned int chunk_sectors) { q->limits.chunk_sectors = chunk_sectors; } EXPORT_SYMBOL(blk_queue_chunk_sectors); /** * blk_queue_max_discard_sectors - set max sectors for a single discard * @q: the request queue for the device * @max_discard_sectors: maximum number of sectors to discard **/ void blk_queue_max_discard_sectors(struct request_queue *q, unsigned int max_discard_sectors) { q->limits.max_hw_discard_sectors = max_discard_sectors; q->limits.max_discard_sectors = max_discard_sectors; } EXPORT_SYMBOL(blk_queue_max_discard_sectors); /** * blk_queue_max_secure_erase_sectors - set max sectors for a secure erase * @q: the request queue for the device * @max_sectors: maximum number of sectors to secure_erase **/ void blk_queue_max_secure_erase_sectors(struct request_queue *q, unsigned int max_sectors) { q->limits.max_secure_erase_sectors = max_sectors; } EXPORT_SYMBOL(blk_queue_max_secure_erase_sectors); /** * blk_queue_max_write_zeroes_sectors - set max sectors for a single * write zeroes * @q: the request queue for the device * @max_write_zeroes_sectors: maximum number of sectors to write per command **/ void blk_queue_max_write_zeroes_sectors(struct request_queue *q, unsigned int max_write_zeroes_sectors) { q->limits.max_write_zeroes_sectors = max_write_zeroes_sectors; } EXPORT_SYMBOL(blk_queue_max_write_zeroes_sectors); /** * blk_queue_max_zone_append_sectors - set max sectors for a single zone append * @q: the request queue for the device * @max_zone_append_sectors: maximum number of sectors to write per command **/ void blk_queue_max_zone_append_sectors(struct request_queue *q, unsigned int max_zone_append_sectors) { unsigned int max_sectors; if (WARN_ON(!blk_queue_is_zoned(q))) return; max_sectors = min(q->limits.max_hw_sectors, max_zone_append_sectors); max_sectors = min(q->limits.chunk_sectors, max_sectors); /* * Signal eventual driver bugs resulting in the max_zone_append sectors limit * being 0 due to a 0 argument, the chunk_sectors limit (zone size) not set, * or the max_hw_sectors limit not set. */ WARN_ON(!max_sectors); q->limits.max_zone_append_sectors = max_sectors; } EXPORT_SYMBOL_GPL(blk_queue_max_zone_append_sectors); /** * blk_queue_max_segments - set max hw segments for a request for this queue * @q: the request queue for the device * @max_segments: max number of segments * * Description: * Enables a low level driver to set an upper limit on the number of * hw data segments in a request. **/ void blk_queue_max_segments(struct request_queue *q, unsigned short max_segments) { if (!max_segments) { max_segments = 1; pr_info("set to minimum %u\n", max_segments); } q->limits.max_segments = max_segments; } EXPORT_SYMBOL(blk_queue_max_segments); /** * blk_queue_max_discard_segments - set max segments for discard requests * @q: the request queue for the device * @max_segments: max number of segments * * Description: * Enables a low level driver to set an upper limit on the number of * segments in a discard request. **/ void blk_queue_max_discard_segments(struct request_queue *q, unsigned short max_segments) { q->limits.max_discard_segments = max_segments; } EXPORT_SYMBOL_GPL(blk_queue_max_discard_segments); /** * blk_queue_max_segment_size - set max segment size for blk_rq_map_sg * @q: the request queue for the device * @max_size: max size of segment in bytes * * Description: * Enables a low level driver to set an upper limit on the size of a * coalesced segment **/ void blk_queue_max_segment_size(struct request_queue *q, unsigned int max_size) { unsigned int min_max_segment_size = PAGE_SIZE; if (max_size < min_max_segment_size) { blk_enable_sub_page_limits(&q->limits); min_max_segment_size = SECTOR_SIZE; } if (max_size < min_max_segment_size) { max_size = min_max_segment_size; pr_info("set to minimum %u\n", max_size); } /* see blk_queue_virt_boundary() for the explanation */ WARN_ON_ONCE(q->limits.virt_boundary_mask); q->limits.max_segment_size = max_size; } EXPORT_SYMBOL(blk_queue_max_segment_size); /** * blk_queue_logical_block_size - set logical block size for the queue * @q: the request queue for the device * @size: the logical block size, in bytes * * Description: * This should be set to the lowest possible block size that the * storage device can address. The default of 512 covers most * hardware. **/ void blk_queue_logical_block_size(struct request_queue *q, unsigned int size) { struct queue_limits *limits = &q->limits; limits->logical_block_size = size; if (limits->physical_block_size < size) limits->physical_block_size = size; if (limits->io_min < limits->physical_block_size) limits->io_min = limits->physical_block_size; limits->max_hw_sectors = round_down(limits->max_hw_sectors, size >> SECTOR_SHIFT); limits->max_sectors = round_down(limits->max_sectors, size >> SECTOR_SHIFT); } EXPORT_SYMBOL(blk_queue_logical_block_size); /** * blk_queue_physical_block_size - set physical block size for the queue * @q: the request queue for the device * @size: the physical block size, in bytes * * Description: * This should be set to the lowest possible sector size that the * hardware can operate on without reverting to read-modify-write * operations. */ void blk_queue_physical_block_size(struct request_queue *q, unsigned int size) { q->limits.physical_block_size = size; if (q->limits.physical_block_size < q->limits.logical_block_size) q->limits.physical_block_size = q->limits.logical_block_size; if (q->limits.io_min < q->limits.physical_block_size) q->limits.io_min = q->limits.physical_block_size; } EXPORT_SYMBOL(blk_queue_physical_block_size); /** * blk_queue_zone_write_granularity - set zone write granularity for the queue * @q: the request queue for the zoned device * @size: the zone write granularity size, in bytes * * Description: * This should be set to the lowest possible size allowing to write in * sequential zones of a zoned block device. */ void blk_queue_zone_write_granularity(struct request_queue *q, unsigned int size) { if (WARN_ON_ONCE(!blk_queue_is_zoned(q))) return; q->limits.zone_write_granularity = size; if (q->limits.zone_write_granularity < q->limits.logical_block_size) q->limits.zone_write_granularity = q->limits.logical_block_size; } EXPORT_SYMBOL_GPL(blk_queue_zone_write_granularity); /** * blk_queue_alignment_offset - set physical block alignment offset * @q: the request queue for the device * @offset: alignment offset in bytes * * Description: * Some devices are naturally misaligned to compensate for things like * the legacy DOS partition table 63-sector offset. Low-level drivers * should call this function for devices whose first sector is not * naturally aligned. */ void blk_queue_alignment_offset(struct request_queue *q, unsigned int offset) { q->limits.alignment_offset = offset & (q->limits.physical_block_size - 1); q->limits.misaligned = 0; } EXPORT_SYMBOL(blk_queue_alignment_offset); void disk_update_readahead(struct gendisk *disk) { struct request_queue *q = disk->queue; /* * For read-ahead of large files to be effective, we need to read ahead * at least twice the optimal I/O size. */ disk->bdi->ra_pages = max(queue_io_opt(q) * 2 / PAGE_SIZE, VM_READAHEAD_PAGES); disk->bdi->io_pages = queue_max_sectors(q) >> (PAGE_SHIFT - 9); } EXPORT_SYMBOL_GPL(disk_update_readahead); /** * blk_limits_io_min - set minimum request size for a device * @limits: the queue limits * @min: smallest I/O size in bytes * * Description: * Some devices have an internal block size bigger than the reported * hardware sector size. This function can be used to signal the * smallest I/O the device can perform without incurring a performance * penalty. */ void blk_limits_io_min(struct queue_limits *limits, unsigned int min) { limits->io_min = min; if (limits->io_min < limits->logical_block_size) limits->io_min = limits->logical_block_size; if (limits->io_min < limits->physical_block_size) limits->io_min = limits->physical_block_size; } EXPORT_SYMBOL(blk_limits_io_min); /** * blk_queue_io_min - set minimum request size for the queue * @q: the request queue for the device * @min: smallest I/O size in bytes * * Description: * Storage devices may report a granularity or preferred minimum I/O * size which is the smallest request the device can perform without * incurring a performance penalty. For disk drives this is often the * physical block size. For RAID arrays it is often the stripe chunk * size. A properly aligned multiple of minimum_io_size is the * preferred request size for workloads where a high number of I/O * operations is desired. */ void blk_queue_io_min(struct request_queue *q, unsigned int min) { blk_limits_io_min(&q->limits, min); } EXPORT_SYMBOL(blk_queue_io_min); /** * blk_limits_io_opt - set optimal request size for a device * @limits: the queue limits * @opt: smallest I/O size in bytes * * Description: * Storage devices may report an optimal I/O size, which is the * device's preferred unit for sustained I/O. This is rarely reported * for disk drives. For RAID arrays it is usually the stripe width or * the internal track size. A properly aligned multiple of * optimal_io_size is the preferred request size for workloads where * sustained throughput is desired. */ void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt) { limits->io_opt = opt; } EXPORT_SYMBOL(blk_limits_io_opt); /** * blk_queue_io_opt - set optimal request size for the queue * @q: the request queue for the device * @opt: optimal request size in bytes * * Description: * Storage devices may report an optimal I/O size, which is the * device's preferred unit for sustained I/O. This is rarely reported * for disk drives. For RAID arrays it is usually the stripe width or * the internal track size. A properly aligned multiple of * optimal_io_size is the preferred request size for workloads where * sustained throughput is desired. */ void blk_queue_io_opt(struct request_queue *q, unsigned int opt) { blk_limits_io_opt(&q->limits, opt); if (!q->disk) return; q->disk->bdi->ra_pages = max(queue_io_opt(q) * 2 / PAGE_SIZE, VM_READAHEAD_PAGES); } EXPORT_SYMBOL(blk_queue_io_opt); static int queue_limit_alignment_offset(struct queue_limits *lim, sector_t sector) { unsigned int granularity = max(lim->physical_block_size, lim->io_min); unsigned int alignment = sector_div(sector, granularity >> SECTOR_SHIFT) << SECTOR_SHIFT; return (granularity + lim->alignment_offset - alignment) % granularity; } static unsigned int queue_limit_discard_alignment(struct queue_limits *lim, sector_t sector) { unsigned int alignment, granularity, offset; if (!lim->max_discard_sectors) return 0; /* Why are these in bytes, not sectors? */ alignment = lim->discard_alignment >> SECTOR_SHIFT; granularity = lim->discard_granularity >> SECTOR_SHIFT; if (!granularity) return 0; /* Offset of the partition start in 'granularity' sectors */ offset = sector_div(sector, granularity); /* And why do we do this modulus *again* in blkdev_issue_discard()? */ offset = (granularity + alignment - offset) % granularity; /* Turn it back into bytes, gaah */ return offset << SECTOR_SHIFT; } static unsigned int blk_round_down_sectors(unsigned int sectors, unsigned int lbs) { sectors = round_down(sectors, lbs >> SECTOR_SHIFT); if (sectors < PAGE_SIZE >> SECTOR_SHIFT) sectors = PAGE_SIZE >> SECTOR_SHIFT; return sectors; } /** * blk_stack_limits - adjust queue_limits for stacked devices * @t: the stacking driver limits (top device) * @b: the underlying queue limits (bottom, component device) * @start: first data sector within component device * * Description: * This function is used by stacking drivers like MD and DM to ensure * that all component devices have compatible block sizes and * alignments. The stacking driver must provide a queue_limits * struct (top) and then iteratively call the stacking function for * all component (bottom) devices. The stacking function will * attempt to combine the values and ensure proper alignment. * * Returns 0 if the top and bottom queue_limits are compatible. The * top device's block sizes and alignment offsets may be adjusted to * ensure alignment with the bottom device. If no compatible sizes * and alignments exist, -1 is returned and the resulting top * queue_limits will have the misaligned flag set to indicate that * the alignment_offset is undefined. */ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, sector_t start) { unsigned int top, bottom, alignment, ret = 0; t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors); t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors); t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors); t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors, b->max_write_zeroes_sectors); t->max_zone_append_sectors = min(t->max_zone_append_sectors, b->max_zone_append_sectors); t->bounce = max(t->bounce, b->bounce); t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask, b->seg_boundary_mask); t->virt_boundary_mask = min_not_zero(t->virt_boundary_mask, b->virt_boundary_mask); t->max_segments = min_not_zero(t->max_segments, b->max_segments); t->max_discard_segments = min_not_zero(t->max_discard_segments, b->max_discard_segments); t->max_integrity_segments = min_not_zero(t->max_integrity_segments, b->max_integrity_segments); t->max_segment_size = min_not_zero(t->max_segment_size, b->max_segment_size); t->misaligned |= b->misaligned; alignment = queue_limit_alignment_offset(b, start); /* Bottom device has different alignment. Check that it is * compatible with the current top alignment. */ if (t->alignment_offset != alignment) { top = max(t->physical_block_size, t->io_min) + t->alignment_offset; bottom = max(b->physical_block_size, b->io_min) + alignment; /* Verify that top and bottom intervals line up */ if (max(top, bottom) % min(top, bottom)) { t->misaligned = 1; ret = -1; } } t->logical_block_size = max(t->logical_block_size, b->logical_block_size); t->physical_block_size = max(t->physical_block_size, b->physical_block_size); t->io_min = max(t->io_min, b->io_min); t->io_opt = lcm_not_zero(t->io_opt, b->io_opt); t->dma_alignment = max(t->dma_alignment, b->dma_alignment); /* Set non-power-of-2 compatible chunk_sectors boundary */ if (b->chunk_sectors) t->chunk_sectors = gcd(t->chunk_sectors, b->chunk_sectors); /* Physical block size a multiple of the logical block size? */ if (t->physical_block_size & (t->logical_block_size - 1)) { t->physical_block_size = t->logical_block_size; t->misaligned = 1; ret = -1; } /* Minimum I/O a multiple of the physical block size? */ if (t->io_min & (t->physical_block_size - 1)) { t->io_min = t->physical_block_size; t->misaligned = 1; ret = -1; } /* Optimal I/O a multiple of the physical block size? */ if (t->io_opt & (t->physical_block_size - 1)) { t->io_opt = 0; t->misaligned = 1; ret = -1; } /* chunk_sectors a multiple of the physical block size? */ if ((t->chunk_sectors << 9) & (t->physical_block_size - 1)) { t->chunk_sectors = 0; t->misaligned = 1; ret = -1; } t->raid_partial_stripes_expensive = max(t->raid_partial_stripes_expensive, b->raid_partial_stripes_expensive); /* Find lowest common alignment_offset */ t->alignment_offset = lcm_not_zero(t->alignment_offset, alignment) % max(t->physical_block_size, t->io_min); /* Verify that new alignment_offset is on a logical block boundary */ if (t->alignment_offset & (t->logical_block_size - 1)) { t->misaligned = 1; ret = -1; } t->max_sectors = blk_round_down_sectors(t->max_sectors, t->logical_block_size); t->max_hw_sectors = blk_round_down_sectors(t->max_hw_sectors, t->logical_block_size); t->max_dev_sectors = blk_round_down_sectors(t->max_dev_sectors, t->logical_block_size); /* Discard alignment and granularity */ if (b->discard_granularity) { alignment = queue_limit_discard_alignment(b, start); if (t->discard_granularity != 0 && t->discard_alignment != alignment) { top = t->discard_granularity + t->discard_alignment; bottom = b->discard_granularity + alignment; /* Verify that top and bottom intervals line up */ if ((max(top, bottom) % min(top, bottom)) != 0) t->discard_misaligned = 1; } t->max_discard_sectors = min_not_zero(t->max_discard_sectors, b->max_discard_sectors); t->max_hw_discard_sectors = min_not_zero(t->max_hw_discard_sectors, b->max_hw_discard_sectors); t->discard_granularity = max(t->discard_granularity, b->discard_granularity); t->discard_alignment = lcm_not_zero(t->discard_alignment, alignment) % t->discard_granularity; } t->max_secure_erase_sectors = min_not_zero(t->max_secure_erase_sectors, b->max_secure_erase_sectors); t->zone_write_granularity = max(t->zone_write_granularity, b->zone_write_granularity); t->zoned = max(t->zoned, b->zoned); if (!t->zoned) { t->zone_write_granularity = 0; t->max_zone_append_sectors = 0; } return ret; } EXPORT_SYMBOL(blk_stack_limits); /** * disk_stack_limits - adjust queue limits for stacked drivers * @disk: MD/DM gendisk (top) * @bdev: the underlying block device (bottom) * @offset: offset to beginning of data within component device * * Description: * Merges the limits for a top level gendisk and a bottom level * block_device. */ void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, sector_t offset) { struct request_queue *t = disk->queue; if (blk_stack_limits(&t->limits, &bdev_get_queue(bdev)->limits, get_start_sect(bdev) + (offset >> 9)) < 0) pr_notice("%s: Warning: Device %pg is misaligned\n", disk->disk_name, bdev); disk_update_readahead(disk); } EXPORT_SYMBOL(disk_stack_limits); /** * blk_queue_update_dma_pad - update pad mask * @q: the request queue for the device * @mask: pad mask * * Update dma pad mask. * * Appending pad buffer to a request modifies the last entry of a * scatter list such that it includes the pad buffer. **/ void blk_queue_update_dma_pad(struct request_queue *q, unsigned int mask) { if (mask > q->dma_pad_mask) q->dma_pad_mask = mask; } EXPORT_SYMBOL(blk_queue_update_dma_pad); /** * blk_queue_segment_boundary - set boundary rules for segment merging * @q: the request queue for the device * @mask: the memory boundary mask **/ void blk_queue_segment_boundary(struct request_queue *q, unsigned long mask) { if (mask < PAGE_SIZE - 1) { mask = PAGE_SIZE - 1; pr_info("set to minimum %lx\n", mask); } q->limits.seg_boundary_mask = mask; } EXPORT_SYMBOL(blk_queue_segment_boundary); /** * blk_queue_virt_boundary - set boundary rules for bio merging * @q: the request queue for the device * @mask: the memory boundary mask **/ void blk_queue_virt_boundary(struct request_queue *q, unsigned long mask) { q->limits.virt_boundary_mask = mask; /* * Devices that require a virtual boundary do not support scatter/gather * I/O natively, but instead require a descriptor list entry for each * page (which might not be idential to the Linux PAGE_SIZE). Because * of that they are not limited by our notion of "segment size". */ if (mask) q->limits.max_segment_size = UINT_MAX; } EXPORT_SYMBOL(blk_queue_virt_boundary); /** * blk_queue_dma_alignment - set dma length and memory alignment * @q: the request queue for the device * @mask: alignment mask * * description: * set required memory and length alignment for direct dma transactions. * this is used when building direct io requests for the queue. * **/ void blk_queue_dma_alignment(struct request_queue *q, int mask) { q->limits.dma_alignment = mask; } EXPORT_SYMBOL(blk_queue_dma_alignment); /** * blk_queue_update_dma_alignment - update dma length and memory alignment * @q: the request queue for the device * @mask: alignment mask * * description: * update required memory and length alignment for direct dma transactions. * If the requested alignment is larger than the current alignment, then * the current queue alignment is updated to the new value, otherwise it * is left alone. The design of this is to allow multiple objects * (driver, device, transport etc) to set their respective * alignments without having them interfere. * **/ void blk_queue_update_dma_alignment(struct request_queue *q, int mask) { BUG_ON(mask > PAGE_SIZE); if (mask > q->limits.dma_alignment) q->limits.dma_alignment = mask; } EXPORT_SYMBOL(blk_queue_update_dma_alignment); /** * blk_set_queue_depth - tell the block layer about the device queue depth * @q: the request queue for the device * @depth: queue depth * */ void blk_set_queue_depth(struct request_queue *q, unsigned int depth) { q->queue_depth = depth; rq_qos_queue_depth_changed(q); } EXPORT_SYMBOL(blk_set_queue_depth); /** * blk_queue_write_cache - configure queue's write cache * @q: the request queue for the device * @wc: write back cache on or off * @fua: device supports FUA writes, if true * * Tell the block layer about the write cache of @q. */ void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua) { if (wc) { blk_queue_flag_set(QUEUE_FLAG_HW_WC, q); blk_queue_flag_set(QUEUE_FLAG_WC, q); } else { blk_queue_flag_clear(QUEUE_FLAG_HW_WC, q); blk_queue_flag_clear(QUEUE_FLAG_WC, q); } if (fua) blk_queue_flag_set(QUEUE_FLAG_FUA, q); else blk_queue_flag_clear(QUEUE_FLAG_FUA, q); wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); } EXPORT_SYMBOL_GPL(blk_queue_write_cache); /** * blk_queue_required_elevator_features - Set a queue required elevator features * @q: the request queue for the target device * @features: Required elevator features OR'ed together * * Tell the block layer that for the device controlled through @q, only the * only elevators that can be used are those that implement at least the set of * features specified by @features. */ void blk_queue_required_elevator_features(struct request_queue *q, unsigned int features) { q->required_elevator_features = features; } EXPORT_SYMBOL_GPL(blk_queue_required_elevator_features); /** * blk_queue_can_use_dma_map_merging - configure queue for merging segments. * @q: the request queue for the device * @dev: the device pointer for dma * * Tell the block layer about merging the segments by dma map of @q. */ bool blk_queue_can_use_dma_map_merging(struct request_queue *q, struct device *dev) { unsigned long boundary = dma_get_merge_boundary(dev); if (!boundary) return false; /* No need to update max_segment_size. see blk_queue_virt_boundary() */ blk_queue_virt_boundary(q, boundary); return true; } EXPORT_SYMBOL_GPL(blk_queue_can_use_dma_map_merging); static bool disk_has_partitions(struct gendisk *disk) { unsigned long idx; struct block_device *part; bool ret = false; rcu_read_lock(); xa_for_each(&disk->part_tbl, idx, part) { if (bdev_is_partition(part)) { ret = true; break; } } rcu_read_unlock(); return ret; } /** * disk_set_zoned - configure the zoned model for a disk * @disk: the gendisk of the queue to configure * @model: the zoned model to set * * Set the zoned model of @disk to @model. * * When @model is BLK_ZONED_HM (host managed), this should be called only * if zoned block device support is enabled (CONFIG_BLK_DEV_ZONED option). * If @model specifies BLK_ZONED_HA (host aware), the effective model used * depends on CONFIG_BLK_DEV_ZONED settings and on the existence of partitions * on the disk. */ void disk_set_zoned(struct gendisk *disk, enum blk_zoned_model model) { struct request_queue *q = disk->queue; unsigned int old_model = q->limits.zoned; switch (model) { case BLK_ZONED_HM: /* * Host managed devices are supported only if * CONFIG_BLK_DEV_ZONED is enabled. */ WARN_ON_ONCE(!IS_ENABLED(CONFIG_BLK_DEV_ZONED)); break; case BLK_ZONED_HA: /* * Host aware devices can be treated either as regular block * devices (similar to drive managed devices) or as zoned block * devices to take advantage of the zone command set, similarly * to host managed devices. We try the latter if there are no * partitions and zoned block device support is enabled, else * we do nothing special as far as the block layer is concerned. */ if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) || disk_has_partitions(disk)) model = BLK_ZONED_NONE; break; case BLK_ZONED_NONE: default: if (WARN_ON_ONCE(model != BLK_ZONED_NONE)) model = BLK_ZONED_NONE; break; } q->limits.zoned = model; if (model != BLK_ZONED_NONE) { /* * Set the zone write granularity to the device logical block * size by default. The driver can change this value if needed. */ blk_queue_zone_write_granularity(q, queue_logical_block_size(q)); } else if (old_model != BLK_ZONED_NONE) { disk_clear_zone_settings(disk); } } EXPORT_SYMBOL_GPL(disk_set_zoned); int bdev_alignment_offset(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); if (q->limits.misaligned) return -1; if (bdev_is_partition(bdev)) return queue_limit_alignment_offset(&q->limits, bdev->bd_start_sect); return q->limits.alignment_offset; } EXPORT_SYMBOL_GPL(bdev_alignment_offset); unsigned int bdev_discard_alignment(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); if (bdev_is_partition(bdev)) return queue_limit_discard_alignment(&q->limits, bdev->bd_start_sect); return q->limits.discard_alignment; } EXPORT_SYMBOL_GPL(bdev_discard_alignment); |
83 85 85 85 50 50 50 50 3 3 15 63 63 49 15 15 25 88 88 25 10 88 65 25 25 25 122 10 10 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 | // SPDX-License-Identifier: GPL-2.0-only /* * fs/kernfs/mount.c - kernfs mount implementation * * Copyright (c) 2001-3 Patrick Mochel * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org> */ #include <linux/fs.h> #include <linux/mount.h> #include <linux/init.h> #include <linux/magic.h> #include <linux/slab.h> #include <linux/pagemap.h> #include <linux/namei.h> #include <linux/seq_file.h> #include <linux/exportfs.h> #include "kernfs-internal.h" struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache; struct kernfs_global_locks *kernfs_locks; static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry) { struct kernfs_root *root = kernfs_root(kernfs_dentry_node(dentry)); struct kernfs_syscall_ops *scops = root->syscall_ops; if (scops && scops->show_options) return scops->show_options(sf, root); return 0; } static int kernfs_sop_show_path(struct seq_file *sf, struct dentry *dentry) { struct kernfs_node *node = kernfs_dentry_node(dentry); struct kernfs_root *root = kernfs_root(node); struct kernfs_syscall_ops *scops = root->syscall_ops; if (scops && scops->show_path) return scops->show_path(sf, node, root); seq_dentry(sf, dentry, " \t\n\\"); return 0; } const struct super_operations kernfs_sops = { .statfs = simple_statfs, .drop_inode = generic_delete_inode, .evict_inode = kernfs_evict_inode, .show_options = kernfs_sop_show_options, .show_path = kernfs_sop_show_path, }; static int kernfs_encode_fh(struct inode *inode, __u32 *fh, int *max_len, struct inode *parent) { struct kernfs_node *kn = inode->i_private; if (*max_len < 2) { *max_len = 2; return FILEID_INVALID; } *max_len = 2; *(u64 *)fh = kn->id; return FILEID_KERNFS; } static struct dentry *__kernfs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type, bool get_parent) { struct kernfs_super_info *info = kernfs_info(sb); struct kernfs_node *kn; struct inode *inode; u64 id; if (fh_len < 2) return NULL; switch (fh_type) { case FILEID_KERNFS: id = *(u64 *)fid; break; case FILEID_INO32_GEN: case FILEID_INO32_GEN_PARENT: /* * blk_log_action() exposes "LOW32,HIGH32" pair without * type and userland can call us with generic fid * constructed from them. Combine it back to ID. See * blk_log_action(). */ id = ((u64)fid->i32.gen << 32) | fid->i32.ino; break; default: return NULL; } kn = kernfs_find_and_get_node_by_id(info->root, id); if (!kn) return ERR_PTR(-ESTALE); if (get_parent) { struct kernfs_node *parent; parent = kernfs_get_parent(kn); kernfs_put(kn); kn = parent; if (!kn) return ERR_PTR(-ESTALE); } inode = kernfs_get_inode(sb, kn); kernfs_put(kn); if (!inode) return ERR_PTR(-ESTALE); return d_obtain_alias(inode); } static struct dentry *kernfs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { return __kernfs_fh_to_dentry(sb, fid, fh_len, fh_type, false); } static struct dentry *kernfs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { return __kernfs_fh_to_dentry(sb, fid, fh_len, fh_type, true); } static struct dentry *kernfs_get_parent_dentry(struct dentry *child) { struct kernfs_node *kn = kernfs_dentry_node(child); return d_obtain_alias(kernfs_get_inode(child->d_sb, kn->parent)); } static const struct export_operations kernfs_export_ops = { .encode_fh = kernfs_encode_fh, .fh_to_dentry = kernfs_fh_to_dentry, .fh_to_parent = kernfs_fh_to_parent, .get_parent = kernfs_get_parent_dentry, }; /** * kernfs_root_from_sb - determine kernfs_root associated with a super_block * @sb: the super_block in question * * Return: the kernfs_root associated with @sb. If @sb is not a kernfs one, * %NULL is returned. */ struct kernfs_root *kernfs_root_from_sb(struct super_block *sb) { if (sb->s_op == &kernfs_sops) return kernfs_info(sb)->root; return NULL; } /* * find the next ancestor in the path down to @child, where @parent was the * ancestor whose descendant we want to find. * * Say the path is /a/b/c/d. @child is d, @parent is %NULL. We return the root * node. If @parent is b, then we return the node for c. * Passing in d as @parent is not ok. */ static struct kernfs_node *find_next_ancestor(struct kernfs_node *child, struct kernfs_node *parent) { if (child == parent) { pr_crit_once("BUG in find_next_ancestor: called with parent == child"); return NULL; } while (child->parent != parent) { if (!child->parent) return NULL; child = child->parent; } return child; } /** * kernfs_node_dentry - get a dentry for the given kernfs_node * @kn: kernfs_node for which a dentry is needed * @sb: the kernfs super_block * * Return: the dentry pointer */ struct dentry *kernfs_node_dentry(struct kernfs_node *kn, struct super_block *sb) { struct dentry *dentry; struct kernfs_node *knparent = NULL; BUG_ON(sb->s_op != &kernfs_sops); dentry = dget(sb->s_root); /* Check if this is the root kernfs_node */ if (!kn->parent) return dentry; knparent = find_next_ancestor(kn, NULL); if (WARN_ON(!knparent)) { dput(dentry); return ERR_PTR(-EINVAL); } do { struct dentry *dtmp; struct kernfs_node *kntmp; if (kn == knparent) return dentry; kntmp = find_next_ancestor(kn, knparent); if (WARN_ON(!kntmp)) { dput(dentry); return ERR_PTR(-EINVAL); } dtmp = lookup_positive_unlocked(kntmp->name, dentry, strlen(kntmp->name)); dput(dentry); if (IS_ERR(dtmp)) return dtmp; knparent = kntmp; dentry = dtmp; } while (true); } static int kernfs_fill_super(struct super_block *sb, struct kernfs_fs_context *kfc) { struct kernfs_super_info *info = kernfs_info(sb); struct kernfs_root *kf_root = kfc->root; struct inode *inode; struct dentry *root; info->sb = sb; /* Userspace would break if executables or devices appear on sysfs */ sb->s_iflags |= SB_I_NOEXEC | SB_I_NODEV; sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = kfc->magic; sb->s_op = &kernfs_sops; sb->s_xattr = kernfs_xattr_handlers; if (info->root->flags & KERNFS_ROOT_SUPPORT_EXPORTOP) sb->s_export_op = &kernfs_export_ops; sb->s_time_gran = 1; /* sysfs dentries and inodes don't require IO to create */ sb->s_shrink.seeks = 0; /* get root inode, initialize and unlock it */ down_read(&kf_root->kernfs_rwsem); inode = kernfs_get_inode(sb, info->root->kn); up_read(&kf_root->kernfs_rwsem); if (!inode) { pr_debug("kernfs: could not get root inode\n"); return -ENOMEM; } /* instantiate and link root dentry */ root = d_make_root(inode); if (!root) { pr_debug("%s: could not get root dentry!\n", __func__); return -ENOMEM; } sb->s_root = root; sb->s_d_op = &kernfs_dops; return 0; } static int kernfs_test_super(struct super_block *sb, struct fs_context *fc) { struct kernfs_super_info *sb_info = kernfs_info(sb); struct kernfs_super_info *info = fc->s_fs_info; return sb_info->root == info->root && sb_info->ns == info->ns; } static int kernfs_set_super(struct super_block *sb, struct fs_context *fc) { struct kernfs_fs_context *kfc = fc->fs_private; kfc->ns_tag = NULL; return set_anon_super_fc(sb, fc); } /** * kernfs_super_ns - determine the namespace tag of a kernfs super_block * @sb: super_block of interest * * Return: the namespace tag associated with kernfs super_block @sb. */ const void *kernfs_super_ns(struct super_block *sb) { struct kernfs_super_info *info = kernfs_info(sb); return info->ns; } /** * kernfs_get_tree - kernfs filesystem access/retrieval helper * @fc: The filesystem context. * * This is to be called from each kernfs user's fs_context->ops->get_tree() * implementation, which should set the specified ->@fs_type and ->@flags, and * specify the hierarchy and namespace tag to mount via ->@root and ->@ns, * respectively. * * Return: %0 on success, -errno on failure. */ int kernfs_get_tree(struct fs_context *fc) { struct kernfs_fs_context *kfc = fc->fs_private; struct super_block *sb; struct kernfs_super_info *info; int error; info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) return -ENOMEM; info->root = kfc->root; info->ns = kfc->ns_tag; INIT_LIST_HEAD(&info->node); fc->s_fs_info = info; sb = sget_fc(fc, kernfs_test_super, kernfs_set_super); if (IS_ERR(sb)) return PTR_ERR(sb); if (!sb->s_root) { struct kernfs_super_info *info = kernfs_info(sb); struct kernfs_root *root = kfc->root; kfc->new_sb_created = true; error = kernfs_fill_super(sb, kfc); if (error) { deactivate_locked_super(sb); return error; } sb->s_flags |= SB_ACTIVE; down_write(&root->kernfs_rwsem); list_add(&info->node, &info->root->supers); up_write(&root->kernfs_rwsem); } fc->root = dget(sb->s_root); return 0; } void kernfs_free_fs_context(struct fs_context *fc) { /* Note that we don't deal with kfc->ns_tag here. */ kfree(fc->s_fs_info); fc->s_fs_info = NULL; } /** * kernfs_kill_sb - kill_sb for kernfs * @sb: super_block being killed * * This can be used directly for file_system_type->kill_sb(). If a kernfs * user needs extra cleanup, it can implement its own kill_sb() and call * this function at the end. */ void kernfs_kill_sb(struct super_block *sb) { struct kernfs_super_info *info = kernfs_info(sb); struct kernfs_root *root = info->root; down_write(&root->kernfs_rwsem); list_del(&info->node); up_write(&root->kernfs_rwsem); /* * Remove the superblock from fs_supers/s_instances * so we can't find it, before freeing kernfs_super_info. */ kill_anon_super(sb); kfree(info); } static void __init kernfs_mutex_init(void) { int count; for (count = 0; count < NR_KERNFS_LOCKS; count++) mutex_init(&kernfs_locks->open_file_mutex[count]); } static void __init kernfs_lock_init(void) { kernfs_locks = kmalloc(sizeof(struct kernfs_global_locks), GFP_KERNEL); WARN_ON(!kernfs_locks); kernfs_mutex_init(); } void __init kernfs_init(void) { kernfs_node_cache = kmem_cache_create("kernfs_node_cache", sizeof(struct kernfs_node), 0, SLAB_PANIC, NULL); /* Creates slab cache for kernfs inode attributes */ kernfs_iattrs_cache = kmem_cache_create("kernfs_iattrs_cache", sizeof(struct kernfs_iattrs), 0, SLAB_PANIC, NULL); kernfs_lock_init(); } |
335 37 335 335 37 335 335 901 571 5 335 653 56 652 1 1 7 6 6 5 1 6 25 2 23 23 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 | // SPDX-License-Identifier: GPL-2.0-only /* * Pid namespaces * * Authors: * (C) 2007 Pavel Emelyanov <xemul@openvz.org>, OpenVZ, SWsoft Inc. * (C) 2007 Sukadev Bhattiprolu <sukadev@us.ibm.com>, IBM * Many thanks to Oleg Nesterov for comments and help * */ #include <linux/pid.h> #include <linux/pid_namespace.h> #include <linux/user_namespace.h> #include <linux/syscalls.h> #include <linux/cred.h> #include <linux/err.h> #include <linux/acct.h> #include <linux/slab.h> #include <linux/proc_ns.h> #include <linux/reboot.h> #include <linux/export.h> #include <linux/sched/task.h> #include <linux/sched/signal.h> #include <linux/idr.h> static DEFINE_MUTEX(pid_caches_mutex); static struct kmem_cache *pid_ns_cachep; /* Write once array, filled from the beginning. */ static struct kmem_cache *pid_cache[MAX_PID_NS_LEVEL]; /* * creates the kmem cache to allocate pids from. * @level: pid namespace level */ static struct kmem_cache *create_pid_cachep(unsigned int level) { /* Level 0 is init_pid_ns.pid_cachep */ struct kmem_cache **pkc = &pid_cache[level - 1]; struct kmem_cache *kc; char name[4 + 10 + 1]; unsigned int len; kc = READ_ONCE(*pkc); if (kc) return kc; snprintf(name, sizeof(name), "pid_%u", level + 1); len = sizeof(struct pid) + level * sizeof(struct upid); mutex_lock(&pid_caches_mutex); /* Name collision forces to do allocation under mutex. */ if (!*pkc) *pkc = kmem_cache_create(name, len, 0, SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL); mutex_unlock(&pid_caches_mutex); /* current can fail, but someone else can succeed. */ return READ_ONCE(*pkc); } static struct ucounts *inc_pid_namespaces(struct user_namespace *ns) { return inc_ucount(ns, current_euid(), UCOUNT_PID_NAMESPACES); } static void dec_pid_namespaces(struct ucounts *ucounts) { dec_ucount(ucounts, UCOUNT_PID_NAMESPACES); } static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns, struct pid_namespace *parent_pid_ns) { struct pid_namespace *ns; unsigned int level = parent_pid_ns->level + 1; struct ucounts *ucounts; int err; err = -EINVAL; if (!in_userns(parent_pid_ns->user_ns, user_ns)) goto out; err = -ENOSPC; if (level > MAX_PID_NS_LEVEL) goto out; ucounts = inc_pid_namespaces(user_ns); if (!ucounts) goto out; err = -ENOMEM; ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL); if (ns == NULL) goto out_dec; idr_init(&ns->idr); ns->pid_cachep = create_pid_cachep(level); if (ns->pid_cachep == NULL) goto out_free_idr; err = ns_alloc_inum(&ns->ns); if (err) goto out_free_idr; ns->ns.ops = &pidns_operations; refcount_set(&ns->ns.count, 1); ns->level = level; ns->parent = get_pid_ns(parent_pid_ns); ns->user_ns = get_user_ns(user_ns); ns->ucounts = ucounts; ns->pid_allocated = PIDNS_ADDING; return ns; out_free_idr: idr_destroy(&ns->idr); kmem_cache_free(pid_ns_cachep, ns); out_dec: dec_pid_namespaces(ucounts); out: return ERR_PTR(err); } static void delayed_free_pidns(struct rcu_head *p) { struct pid_namespace *ns = container_of(p, struct pid_namespace, rcu); dec_pid_namespaces(ns->ucounts); put_user_ns(ns->user_ns); kmem_cache_free(pid_ns_cachep, ns); } static void destroy_pid_namespace(struct pid_namespace *ns) { ns_free_inum(&ns->ns); idr_destroy(&ns->idr); call_rcu(&ns->rcu, delayed_free_pidns); } struct pid_namespace *copy_pid_ns(unsigned long flags, struct user_namespace *user_ns, struct pid_namespace *old_ns) { if (!(flags & CLONE_NEWPID)) return get_pid_ns(old_ns); if (task_active_pid_ns(current) != old_ns) return ERR_PTR(-EINVAL); return create_pid_namespace(user_ns, old_ns); } void put_pid_ns(struct pid_namespace *ns) { struct pid_namespace *parent; while (ns != &init_pid_ns) { parent = ns->parent; if (!refcount_dec_and_test(&ns->ns.count)) break; destroy_pid_namespace(ns); ns = parent; } } EXPORT_SYMBOL_GPL(put_pid_ns); void zap_pid_ns_processes(struct pid_namespace *pid_ns) { int nr; int rc; struct task_struct *task, *me = current; int init_pids = thread_group_leader(me) ? 1 : 2; struct pid *pid; /* Don't allow any more processes into the pid namespace */ disable_pid_allocation(pid_ns); /* * Ignore SIGCHLD causing any terminated children to autoreap. * This speeds up the namespace shutdown, plus see the comment * below. */ spin_lock_irq(&me->sighand->siglock); me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN; spin_unlock_irq(&me->sighand->siglock); /* * The last thread in the cgroup-init thread group is terminating. * Find remaining pid_ts in the namespace, signal and wait for them * to exit. * * Note: This signals each threads in the namespace - even those that * belong to the same thread group, To avoid this, we would have * to walk the entire tasklist looking a processes in this * namespace, but that could be unnecessarily expensive if the * pid namespace has just a few processes. Or we need to * maintain a tasklist for each pid namespace. * */ rcu_read_lock(); read_lock(&tasklist_lock); nr = 2; idr_for_each_entry_continue(&pid_ns->idr, pid, nr) { task = pid_task(pid, PIDTYPE_PID); if (task && !__fatal_signal_pending(task)) group_send_sig_info(SIGKILL, SEND_SIG_PRIV, task, PIDTYPE_MAX); } read_unlock(&tasklist_lock); rcu_read_unlock(); /* * Reap the EXIT_ZOMBIE children we had before we ignored SIGCHLD. * kernel_wait4() will also block until our children traced from the * parent namespace are detached and become EXIT_DEAD. */ do { clear_thread_flag(TIF_SIGPENDING); clear_thread_flag(TIF_NOTIFY_SIGNAL); rc = kernel_wait4(-1, NULL, __WALL, NULL); } while (rc != -ECHILD); /* * kernel_wait4() misses EXIT_DEAD children, and EXIT_ZOMBIE * process whose parents processes are outside of the pid * namespace. Such processes are created with setns()+fork(). * * If those EXIT_ZOMBIE processes are not reaped by their * parents before their parents exit, they will be reparented * to pid_ns->child_reaper. Thus pidns->child_reaper needs to * stay valid until they all go away. * * The code relies on the pid_ns->child_reaper ignoring * SIGCHILD to cause those EXIT_ZOMBIE processes to be * autoreaped if reparented. * * Semantically it is also desirable to wait for EXIT_ZOMBIE * processes before allowing the child_reaper to be reaped, as * that gives the invariant that when the init process of a * pid namespace is reaped all of the processes in the pid * namespace are gone. * * Once all of the other tasks are gone from the pid_namespace * free_pid() will awaken this task. */ for (;;) { set_current_state(TASK_INTERRUPTIBLE); if (pid_ns->pid_allocated == init_pids) break; /* * Release tasks_rcu_exit_srcu to avoid following deadlock: * * 1) TASK A unshare(CLONE_NEWPID) * 2) TASK A fork() twice -> TASK B (child reaper for new ns) * and TASK C * 3) TASK B exits, kills TASK C, waits for TASK A to reap it * 4) TASK A calls synchronize_rcu_tasks() * -> synchronize_srcu(tasks_rcu_exit_srcu) * 5) *DEADLOCK* * * It is considered safe to release tasks_rcu_exit_srcu here * because we assume the current task can not be concurrently * reaped at this point. */ exit_tasks_rcu_stop(); schedule(); exit_tasks_rcu_start(); } __set_current_state(TASK_RUNNING); if (pid_ns->reboot) current->signal->group_exit_code = pid_ns->reboot; acct_exit_ns(pid_ns); return; } #ifdef CONFIG_CHECKPOINT_RESTORE static int pid_ns_ctl_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct pid_namespace *pid_ns = task_active_pid_ns(current); struct ctl_table tmp = *table; int ret, next; if (write && !checkpoint_restore_ns_capable(pid_ns->user_ns)) return -EPERM; /* * Writing directly to ns' last_pid field is OK, since this field * is volatile in a living namespace anyway and a code writing to * it should synchronize its usage with external means. */ next = idr_get_cursor(&pid_ns->idr) - 1; tmp.data = &next; ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); if (!ret && write) idr_set_cursor(&pid_ns->idr, next + 1); return ret; } extern int pid_max; static struct ctl_table pid_ns_ctl_table[] = { { .procname = "ns_last_pid", .maxlen = sizeof(int), .mode = 0666, /* permissions are checked in the handler */ .proc_handler = pid_ns_ctl_handler, .extra1 = SYSCTL_ZERO, .extra2 = &pid_max, }, { } }; static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } }; #endif /* CONFIG_CHECKPOINT_RESTORE */ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) { if (pid_ns == &init_pid_ns) return 0; switch (cmd) { case LINUX_REBOOT_CMD_RESTART2: case LINUX_REBOOT_CMD_RESTART: pid_ns->reboot = SIGHUP; break; case LINUX_REBOOT_CMD_POWER_OFF: case LINUX_REBOOT_CMD_HALT: pid_ns->reboot = SIGINT; break; default: return -EINVAL; } read_lock(&tasklist_lock); send_sig(SIGKILL, pid_ns->child_reaper, 1); read_unlock(&tasklist_lock); do_exit(0); /* Not reached */ return 0; } static inline struct pid_namespace *to_pid_ns(struct ns_common *ns) { return container_of(ns, struct pid_namespace, ns); } static struct ns_common *pidns_get(struct task_struct *task) { struct pid_namespace *ns; rcu_read_lock(); ns = task_active_pid_ns(task); if (ns) get_pid_ns(ns); rcu_read_unlock(); return ns ? &ns->ns : NULL; } static struct ns_common *pidns_for_children_get(struct task_struct *task) { struct pid_namespace *ns = NULL; task_lock(task); if (task->nsproxy) { ns = task->nsproxy->pid_ns_for_children; get_pid_ns(ns); } task_unlock(task); if (ns) { read_lock(&tasklist_lock); if (!ns->child_reaper) { put_pid_ns(ns); ns = NULL; } read_unlock(&tasklist_lock); } return ns ? &ns->ns : NULL; } static void pidns_put(struct ns_common *ns) { put_pid_ns(to_pid_ns(ns)); } static int pidns_install(struct nsset *nsset, struct ns_common *ns) { struct nsproxy *nsproxy = nsset->nsproxy; struct pid_namespace *active = task_active_pid_ns(current); struct pid_namespace *ancestor, *new = to_pid_ns(ns); if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) || !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) return -EPERM; /* * Only allow entering the current active pid namespace * or a child of the current active pid namespace. * * This is required for fork to return a usable pid value and * this maintains the property that processes and their * children can not escape their current pid namespace. */ if (new->level < active->level) return -EINVAL; ancestor = new; while (ancestor->level > active->level) ancestor = ancestor->parent; if (ancestor != active) return -EINVAL; put_pid_ns(nsproxy->pid_ns_for_children); nsproxy->pid_ns_for_children = get_pid_ns(new); return 0; } static struct ns_common *pidns_get_parent(struct ns_common *ns) { struct pid_namespace *active = task_active_pid_ns(current); struct pid_namespace *pid_ns, *p; /* See if the parent is in the current namespace */ pid_ns = p = to_pid_ns(ns)->parent; for (;;) { if (!p) return ERR_PTR(-EPERM); if (p == active) break; p = p->parent; } return &get_pid_ns(pid_ns)->ns; } static struct user_namespace *pidns_owner(struct ns_common *ns) { return to_pid_ns(ns)->user_ns; } const struct proc_ns_operations pidns_operations = { .name = "pid", .type = CLONE_NEWPID, .get = pidns_get, .put = pidns_put, .install = pidns_install, .owner = pidns_owner, .get_parent = pidns_get_parent, }; const struct proc_ns_operations pidns_for_children_operations = { .name = "pid_for_children", .real_ns_name = "pid", .type = CLONE_NEWPID, .get = pidns_for_children_get, .put = pidns_put, .install = pidns_install, .owner = pidns_owner, .get_parent = pidns_get_parent, }; static __init int pid_namespaces_init(void) { pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC | SLAB_ACCOUNT); #ifdef CONFIG_CHECKPOINT_RESTORE register_sysctl_paths(kern_path, pid_ns_ctl_table); #endif return 0; } __initcall(pid_namespaces_init); |
557 556 572 573 342 25 18 319 337 21 315 316 337 338 207 36 223 20 101 4 1009 1008 1 1 1 1 356 20 57 20 453 22 337 26 108 337 452 346 105 209 243 394 57 55 35 35 1017 1017 1 14 14 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 | /* * Ext4 orphan inode handling */ #include <linux/fs.h> #include <linux/quotaops.h> #include <linux/buffer_head.h> #include "ext4.h" #include "ext4_jbd2.h" static int ext4_orphan_file_add(handle_t *handle, struct inode *inode) { int i, j, start; struct ext4_orphan_info *oi = &EXT4_SB(inode->i_sb)->s_orphan_info; int ret = 0; bool found = false; __le32 *bdata; int inodes_per_ob = ext4_inodes_per_orphan_block(inode->i_sb); int looped = 0; /* * Find block with free orphan entry. Use CPU number for a naive hash * for a search start in the orphan file */ start = raw_smp_processor_id()*13 % oi->of_blocks; i = start; do { if (atomic_dec_if_positive(&oi->of_binfo[i].ob_free_entries) >= 0) { found = true; break; } if (++i >= oi->of_blocks) i = 0; } while (i != start); if (!found) { /* * For now we don't grow or shrink orphan file. We just use * whatever was allocated at mke2fs time. The additional * credits we would have to reserve for each orphan inode * operation just don't seem worth it. */ return -ENOSPC; } ret = ext4_journal_get_write_access(handle, inode->i_sb, oi->of_binfo[i].ob_bh, EXT4_JTR_ORPHAN_FILE); if (ret) { atomic_inc(&oi->of_binfo[i].ob_free_entries); return ret; } bdata = (__le32 *)(oi->of_binfo[i].ob_bh->b_data); /* Find empty slot in a block */ j = 0; do { if (looped) { /* * Did we walk through the block several times without * finding free entry? It is theoretically possible * if entries get constantly allocated and freed or * if the block is corrupted. Avoid indefinite looping * and bail. We'll use orphan list instead. */ if (looped > 3) { atomic_inc(&oi->of_binfo[i].ob_free_entries); return -ENOSPC; } cond_resched(); } while (bdata[j]) { if (++j >= inodes_per_ob) { j = 0; looped++; } } } while (cmpxchg(&bdata[j], (__le32)0, cpu_to_le32(inode->i_ino)) != (__le32)0); EXT4_I(inode)->i_orphan_idx = i * inodes_per_ob + j; ext4_set_inode_state(inode, EXT4_STATE_ORPHAN_FILE); return ext4_handle_dirty_metadata(handle, NULL, oi->of_binfo[i].ob_bh); } /* * ext4_orphan_add() links an unlinked or truncated inode into a list of * such inodes, starting at the superblock, in case we crash before the * file is closed/deleted, or in case the inode truncate spans multiple * transactions and the last transaction is not recovered after a crash. * * At filesystem recovery time, we walk this list deleting unlinked * inodes and truncating linked inodes in ext4_orphan_cleanup(). * * Orphan list manipulation functions must be called under i_rwsem unless * we are just creating the inode or deleting it. */ int ext4_orphan_add(handle_t *handle, struct inode *inode) { struct super_block *sb = inode->i_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_iloc iloc; int err = 0, rc; bool dirty = false; if (!sbi->s_journal || is_bad_inode(inode)) return 0; WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && !inode_is_locked(inode)); /* * Inode orphaned in orphan file or in orphan list? */ if (ext4_test_inode_state(inode, EXT4_STATE_ORPHAN_FILE) || !list_empty(&EXT4_I(inode)->i_orphan)) return 0; /* * Orphan handling is only valid for files with data blocks * being truncated, or files being unlinked. Note that we either * hold i_rwsem, or the inode can not be referenced from outside, * so i_nlink should not be bumped due to race */ ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); if (sbi->s_orphan_info.of_blocks) { err = ext4_orphan_file_add(handle, inode); /* * Fallback to normal orphan list of orphan file is * out of space */ if (err != -ENOSPC) return err; } BUFFER_TRACE(sbi->s_sbh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh, EXT4_JTR_NONE); if (err) goto out; err = ext4_reserve_inode_write(handle, inode, &iloc); if (err) goto out; mutex_lock(&sbi->s_orphan_lock); /* * Due to previous errors inode may be already a part of on-disk * orphan list. If so skip on-disk list modification. */ if (!NEXT_ORPHAN(inode) || NEXT_ORPHAN(inode) > (le32_to_cpu(sbi->s_es->s_inodes_count))) { /* Insert this inode at the head of the on-disk orphan list */ NEXT_ORPHAN(inode) = le32_to_cpu(sbi->s_es->s_last_orphan); lock_buffer(sbi->s_sbh); sbi->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); ext4_superblock_csum_set(sb); unlock_buffer(sbi->s_sbh); dirty = true; } list_add(&EXT4_I(inode)->i_orphan, &sbi->s_orphan); mutex_unlock(&sbi->s_orphan_lock); if (dirty) { err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); rc = ext4_mark_iloc_dirty(handle, inode, &iloc); if (!err) err = rc; if (err) { /* * We have to remove inode from in-memory list if * addition to on disk orphan list failed. Stray orphan * list entries can cause panics at unmount time. */ mutex_lock(&sbi->s_orphan_lock); list_del_init(&EXT4_I(inode)->i_orphan); mutex_unlock(&sbi->s_orphan_lock); } } else brelse(iloc.bh); ext4_debug("superblock will point to %lu\n", inode->i_ino); ext4_debug("orphan inode %lu will point to %d\n", inode->i_ino, NEXT_ORPHAN(inode)); out: ext4_std_error(sb, err); return err; } static int ext4_orphan_file_del(handle_t *handle, struct inode *inode) { struct ext4_orphan_info *oi = &EXT4_SB(inode->i_sb)->s_orphan_info; __le32 *bdata; int blk, off; int inodes_per_ob = ext4_inodes_per_orphan_block(inode->i_sb); int ret = 0; if (!handle) goto out; blk = EXT4_I(inode)->i_orphan_idx / inodes_per_ob; off = EXT4_I(inode)->i_orphan_idx % inodes_per_ob; if (WARN_ON_ONCE(blk >= oi->of_blocks)) goto out; ret = ext4_journal_get_write_access(handle, inode->i_sb, oi->of_binfo[blk].ob_bh, EXT4_JTR_ORPHAN_FILE); if (ret) goto out; bdata = (__le32 *)(oi->of_binfo[blk].ob_bh->b_data); bdata[off] = 0; atomic_inc(&oi->of_binfo[blk].ob_free_entries); ret = ext4_handle_dirty_metadata(handle, NULL, oi->of_binfo[blk].ob_bh); out: ext4_clear_inode_state(inode, EXT4_STATE_ORPHAN_FILE); INIT_LIST_HEAD(&EXT4_I(inode)->i_orphan); return ret; } /* * ext4_orphan_del() removes an unlinked or truncated inode from the list * of such inodes stored on disk, because it is finally being cleaned up. */ int ext4_orphan_del(handle_t *handle, struct inode *inode) { struct list_head *prev; struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __u32 ino_next; struct ext4_iloc iloc; int err = 0; if (!sbi->s_journal && !(sbi->s_mount_state & EXT4_ORPHAN_FS)) return 0; WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && !inode_is_locked(inode)); if (ext4_test_inode_state(inode, EXT4_STATE_ORPHAN_FILE)) return ext4_orphan_file_del(handle, inode); /* Do this quick check before taking global s_orphan_lock. */ if (list_empty(&ei->i_orphan)) return 0; if (handle) { /* Grab inode buffer early before taking global s_orphan_lock */ err = ext4_reserve_inode_write(handle, inode, &iloc); } mutex_lock(&sbi->s_orphan_lock); ext4_debug("remove inode %lu from orphan list\n", inode->i_ino); prev = ei->i_orphan.prev; list_del_init(&ei->i_orphan); /* If we're on an error path, we may not have a valid * transaction handle with which to update the orphan list on * disk, but we still need to remove the inode from the linked * list in memory. */ if (!handle || err) { mutex_unlock(&sbi->s_orphan_lock); goto out_err; } ino_next = NEXT_ORPHAN(inode); if (prev == &sbi->s_orphan) { ext4_debug("superblock will point to %u\n", ino_next); BUFFER_TRACE(sbi->s_sbh, "get_write_access"); err = ext4_journal_get_write_access(handle, inode->i_sb, sbi->s_sbh, EXT4_JTR_NONE); if (err) { mutex_unlock(&sbi->s_orphan_lock); goto out_brelse; } lock_buffer(sbi->s_sbh); sbi->s_es->s_last_orphan = cpu_to_le32(ino_next); ext4_superblock_csum_set(inode->i_sb); unlock_buffer(sbi->s_sbh); mutex_unlock(&sbi->s_orphan_lock); err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); } else { struct ext4_iloc iloc2; struct inode *i_prev = &list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode; ext4_debug("orphan inode %lu will point to %u\n", i_prev->i_ino, ino_next); err = ext4_reserve_inode_write(handle, i_prev, &iloc2); if (err) { mutex_unlock(&sbi->s_orphan_lock); goto out_brelse; } NEXT_ORPHAN(i_prev) = ino_next; err = ext4_mark_iloc_dirty(handle, i_prev, &iloc2); mutex_unlock(&sbi->s_orphan_lock); } if (err) goto out_brelse; NEXT_ORPHAN(inode) = 0; err = ext4_mark_iloc_dirty(handle, inode, &iloc); out_err: ext4_std_error(inode->i_sb, err); return err; out_brelse: brelse(iloc.bh); goto out_err; } #ifdef CONFIG_QUOTA static int ext4_quota_on_mount(struct super_block *sb, int type) { return dquot_quota_on_mount(sb, rcu_dereference_protected(EXT4_SB(sb)->s_qf_names[type], lockdep_is_held(&sb->s_umount)), EXT4_SB(sb)->s_jquota_fmt, type); } #endif static void ext4_process_orphan(struct inode *inode, int *nr_truncates, int *nr_orphans) { struct super_block *sb = inode->i_sb; int ret; dquot_initialize(inode); if (inode->i_nlink) { if (test_opt(sb, DEBUG)) ext4_msg(sb, KERN_DEBUG, "%s: truncating inode %lu to %lld bytes", __func__, inode->i_ino, inode->i_size); ext4_debug("truncating inode %lu to %lld bytes\n", inode->i_ino, inode->i_size); inode_lock(inode); truncate_inode_pages(inode->i_mapping, inode->i_size); ret = ext4_truncate(inode); if (ret) { /* * We need to clean up the in-core orphan list * manually if ext4_truncate() failed to get a * transaction handle. */ ext4_orphan_del(NULL, inode); ext4_std_error(inode->i_sb, ret); } inode_unlock(inode); (*nr_truncates)++; } else { if (test_opt(sb, DEBUG)) ext4_msg(sb, KERN_DEBUG, "%s: deleting unreferenced inode %lu", __func__, inode->i_ino); ext4_debug("deleting unreferenced inode %lu\n", inode->i_ino); (*nr_orphans)++; } iput(inode); /* The delete magic happens here! */ } /* ext4_orphan_cleanup() walks a singly-linked list of inodes (starting at * the superblock) which were deleted from all directories, but held open by * a process at the time of a crash. We walk the list and try to delete these * inodes at recovery time (only with a read-write filesystem). * * In order to keep the orphan inode chain consistent during traversal (in * case of crash during recovery), we link each inode into the superblock * orphan list_head and handle it the same way as an inode deletion during * normal operation (which journals the operations for us). * * We only do an iget() and an iput() on each inode, which is very safe if we * accidentally point at an in-use or already deleted inode. The worst that * can happen in this case is that we get a "bit already cleared" message from * ext4_free_inode(). The only reason we would point at a wrong inode is if * e2fsck was run on this filesystem, and it must have already done the orphan * inode cleanup for us, so we can safely abort without any further action. */ void ext4_orphan_cleanup(struct super_block *sb, struct ext4_super_block *es) { unsigned int s_flags = sb->s_flags; int nr_orphans = 0, nr_truncates = 0; struct inode *inode; int i, j; #ifdef CONFIG_QUOTA int quota_update = 0; #endif __le32 *bdata; struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; int inodes_per_ob = ext4_inodes_per_orphan_block(sb); if (!es->s_last_orphan && !oi->of_blocks) { ext4_debug("no orphan inodes to clean up\n"); return; } if (bdev_read_only(sb->s_bdev)) { ext4_msg(sb, KERN_ERR, "write access " "unavailable, skipping orphan cleanup"); return; } /* Check if feature set would not allow a r/w mount */ if (!ext4_feature_set_ok(sb, 0)) { ext4_msg(sb, KERN_INFO, "Skipping orphan cleanup due to " "unknown ROCOMPAT features"); return; } if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { /* don't clear list on RO mount w/ errors */ if (es->s_last_orphan && !(s_flags & SB_RDONLY)) { ext4_msg(sb, KERN_INFO, "Errors on filesystem, " "clearing orphan list."); es->s_last_orphan = 0; } ext4_debug("Skipping orphan recovery on fs with errors.\n"); return; } if (s_flags & SB_RDONLY) { ext4_msg(sb, KERN_INFO, "orphan cleanup on readonly fs"); sb->s_flags &= ~SB_RDONLY; } #ifdef CONFIG_QUOTA /* * Turn on quotas which were not enabled for read-only mounts if * filesystem has quota feature, so that they are updated correctly. */ if (ext4_has_feature_quota(sb) && (s_flags & SB_RDONLY)) { int ret = ext4_enable_quotas(sb); if (!ret) quota_update = 1; else ext4_msg(sb, KERN_ERR, "Cannot turn on quotas: error %d", ret); } /* Turn on journaled quotas used for old sytle */ for (i = 0; i < EXT4_MAXQUOTAS; i++) { if (EXT4_SB(sb)->s_qf_names[i]) { int ret = ext4_quota_on_mount(sb, i); if (!ret) quota_update = 1; else ext4_msg(sb, KERN_ERR, "Cannot turn on journaled " "quota: type %d: error %d", i, ret); } } #endif while (es->s_last_orphan) { /* * We may have encountered an error during cleanup; if * so, skip the rest. */ if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { ext4_debug("Skipping orphan recovery on fs with errors.\n"); es->s_last_orphan = 0; break; } inode = ext4_orphan_get(sb, le32_to_cpu(es->s_last_orphan)); if (IS_ERR(inode)) { es->s_last_orphan = 0; break; } list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); ext4_process_orphan(inode, &nr_truncates, &nr_orphans); } for (i = 0; i < oi->of_blocks; i++) { bdata = (__le32 *)(oi->of_binfo[i].ob_bh->b_data); for (j = 0; j < inodes_per_ob; j++) { if (!bdata[j]) continue; inode = ext4_orphan_get(sb, le32_to_cpu(bdata[j])); if (IS_ERR(inode)) continue; ext4_set_inode_state(inode, EXT4_STATE_ORPHAN_FILE); EXT4_I(inode)->i_orphan_idx = i * inodes_per_ob + j; ext4_process_orphan(inode, &nr_truncates, &nr_orphans); } } #define PLURAL(x) (x), ((x) == 1) ? "" : "s" if (nr_orphans) ext4_msg(sb, KERN_INFO, "%d orphan inode%s deleted", PLURAL(nr_orphans)); if (nr_truncates) ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up", PLURAL(nr_truncates)); #ifdef CONFIG_QUOTA /* Turn off quotas if they were enabled for orphan cleanup */ if (quota_update) { for (i = 0; i < EXT4_MAXQUOTAS; i++) { if (sb_dqopt(sb)->files[i]) dquot_quota_off(sb, i); } } #endif sb->s_flags = s_flags; /* Restore SB_RDONLY status */ } void ext4_release_orphan_info(struct super_block *sb) { int i; struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; if (!oi->of_blocks) return; for (i = 0; i < oi->of_blocks; i++) brelse(oi->of_binfo[i].ob_bh); kfree(oi->of_binfo); } static struct ext4_orphan_block_tail *ext4_orphan_block_tail( struct super_block *sb, struct buffer_head *bh) { return (struct ext4_orphan_block_tail *)(bh->b_data + sb->s_blocksize - sizeof(struct ext4_orphan_block_tail)); } static int ext4_orphan_file_block_csum_verify(struct super_block *sb, struct buffer_head *bh) { __u32 calculated; int inodes_per_ob = ext4_inodes_per_orphan_block(sb); struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; struct ext4_orphan_block_tail *ot; __le64 dsk_block_nr = cpu_to_le64(bh->b_blocknr); if (!ext4_has_metadata_csum(sb)) return 1; ot = ext4_orphan_block_tail(sb, bh); calculated = ext4_chksum(EXT4_SB(sb), oi->of_csum_seed, (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr)); calculated = ext4_chksum(EXT4_SB(sb), calculated, (__u8 *)bh->b_data, inodes_per_ob * sizeof(__u32)); return le32_to_cpu(ot->ob_checksum) == calculated; } /* This gets called only when checksumming is enabled */ void ext4_orphan_file_block_trigger(struct jbd2_buffer_trigger_type *triggers, struct buffer_head *bh, void *data, size_t size) { struct super_block *sb = EXT4_TRIGGER(triggers)->sb; __u32 csum; int inodes_per_ob = ext4_inodes_per_orphan_block(sb); struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; struct ext4_orphan_block_tail *ot; __le64 dsk_block_nr = cpu_to_le64(bh->b_blocknr); csum = ext4_chksum(EXT4_SB(sb), oi->of_csum_seed, (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr)); csum = ext4_chksum(EXT4_SB(sb), csum, (__u8 *)data, inodes_per_ob * sizeof(__u32)); ot = ext4_orphan_block_tail(sb, bh); ot->ob_checksum = cpu_to_le32(csum); } int ext4_init_orphan_info(struct super_block *sb) { struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; struct inode *inode; int i, j; int ret; int free; __le32 *bdata; int inodes_per_ob = ext4_inodes_per_orphan_block(sb); struct ext4_orphan_block_tail *ot; ino_t orphan_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_orphan_file_inum); if (!ext4_has_feature_orphan_file(sb)) return 0; inode = ext4_iget(sb, orphan_ino, EXT4_IGET_SPECIAL); if (IS_ERR(inode)) { ext4_msg(sb, KERN_ERR, "get orphan inode failed"); return PTR_ERR(inode); } oi->of_blocks = inode->i_size >> sb->s_blocksize_bits; oi->of_csum_seed = EXT4_I(inode)->i_csum_seed; oi->of_binfo = kmalloc(oi->of_blocks*sizeof(struct ext4_orphan_block), GFP_KERNEL); if (!oi->of_binfo) { ret = -ENOMEM; goto out_put; } for (i = 0; i < oi->of_blocks; i++) { oi->of_binfo[i].ob_bh = ext4_bread(NULL, inode, i, 0); if (IS_ERR(oi->of_binfo[i].ob_bh)) { ret = PTR_ERR(oi->of_binfo[i].ob_bh); goto out_free; } if (!oi->of_binfo[i].ob_bh) { ret = -EIO; goto out_free; } ot = ext4_orphan_block_tail(sb, oi->of_binfo[i].ob_bh); if (le32_to_cpu(ot->ob_magic) != EXT4_ORPHAN_BLOCK_MAGIC) { ext4_error(sb, "orphan file block %d: bad magic", i); ret = -EIO; goto out_free; } if (!ext4_orphan_file_block_csum_verify(sb, oi->of_binfo[i].ob_bh)) { ext4_error(sb, "orphan file block %d: bad checksum", i); ret = -EIO; goto out_free; } bdata = (__le32 *)(oi->of_binfo[i].ob_bh->b_data); free = 0; for (j = 0; j < inodes_per_ob; j++) if (bdata[j] == 0) free++; atomic_set(&oi->of_binfo[i].ob_free_entries, free); } iput(inode); return 0; out_free: for (i--; i >= 0; i--) brelse(oi->of_binfo[i].ob_bh); kfree(oi->of_binfo); out_put: iput(inode); return ret; } int ext4_orphan_file_empty(struct super_block *sb) { struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; int i; int inodes_per_ob = ext4_inodes_per_orphan_block(sb); if (!ext4_has_feature_orphan_file(sb)) return 1; for (i = 0; i < oi->of_blocks; i++) if (atomic_read(&oi->of_binfo[i].ob_free_entries) != inodes_per_ob) return 0; return 1; } |
263 13 255 271 271 9 9 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 | // SPDX-License-Identifier: GPL-2.0 /* * DMA memory management for framework level HCD code (hc_driver) * * This implementation plugs in through generic "usb_bus" level methods, * and should work with all USB controllers, regardless of bus type. * * Released under the GPLv2 only. */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/device.h> #include <linux/mm.h> #include <linux/io.h> #include <linux/dma-mapping.h> #include <linux/dmapool.h> #include <linux/genalloc.h> #include <linux/usb.h> #include <linux/usb/hcd.h> /* * DMA-Coherent Buffers */ /* FIXME tune these based on pool statistics ... */ static size_t pool_max[HCD_BUFFER_POOLS] = { 32, 128, 512, 2048, }; void __init usb_init_pool_max(void) { /* * The pool_max values must never be smaller than * ARCH_KMALLOC_MINALIGN. */ if (ARCH_KMALLOC_MINALIGN <= 32) ; /* Original value is okay */ else if (ARCH_KMALLOC_MINALIGN <= 64) pool_max[0] = 64; else if (ARCH_KMALLOC_MINALIGN <= 128) pool_max[0] = 0; /* Don't use this pool */ else BUILD_BUG(); /* We don't allow this */ } /* SETUP primitives */ /** * hcd_buffer_create - initialize buffer pools * @hcd: the bus whose buffer pools are to be initialized * * Context: task context, might sleep * * Call this as part of initializing a host controller that uses the dma * memory allocators. It initializes some pools of dma-coherent memory that * will be shared by all drivers using that controller. * * Call hcd_buffer_destroy() to clean up after using those pools. * * Return: 0 if successful. A negative errno value otherwise. */ int hcd_buffer_create(struct usb_hcd *hcd) { char name[16]; int i, size; if (hcd->localmem_pool || !hcd_uses_dma(hcd)) return 0; for (i = 0; i < HCD_BUFFER_POOLS; i++) { size = pool_max[i]; if (!size) continue; snprintf(name, sizeof(name), "buffer-%d", size); hcd->pool[i] = dma_pool_create(name, hcd->self.sysdev, size, size, 0); if (!hcd->pool[i]) { hcd_buffer_destroy(hcd); return -ENOMEM; } } return 0; } /** * hcd_buffer_destroy - deallocate buffer pools * @hcd: the bus whose buffer pools are to be destroyed * * Context: task context, might sleep * * This frees the buffer pools created by hcd_buffer_create(). */ void hcd_buffer_destroy(struct usb_hcd *hcd) { int i; if (!IS_ENABLED(CONFIG_HAS_DMA)) return; for (i = 0; i < HCD_BUFFER_POOLS; i++) { dma_pool_destroy(hcd->pool[i]); hcd->pool[i] = NULL; } } /* sometimes alloc/free could use kmalloc with GFP_DMA, for * better sharing and to leverage mm/slab.c intelligence. */ void *hcd_buffer_alloc( struct usb_bus *bus, size_t size, gfp_t mem_flags, dma_addr_t *dma ) { struct usb_hcd *hcd = bus_to_hcd(bus); int i; if (size == 0) return NULL; if (hcd->localmem_pool) return gen_pool_dma_alloc(hcd->localmem_pool, size, dma); /* some USB hosts just use PIO */ if (!hcd_uses_dma(hcd)) { *dma = ~(dma_addr_t) 0; return kmalloc(size, mem_flags); } for (i = 0; i < HCD_BUFFER_POOLS; i++) { if (size <= pool_max[i]) return dma_pool_alloc(hcd->pool[i], mem_flags, dma); } return dma_alloc_coherent(hcd->self.sysdev, size, dma, mem_flags); } void hcd_buffer_free( struct usb_bus *bus, size_t size, void *addr, dma_addr_t dma ) { struct usb_hcd *hcd = bus_to_hcd(bus); int i; if (!addr) return; if (hcd->localmem_pool) { gen_pool_free(hcd->localmem_pool, (unsigned long)addr, size); return; } if (!hcd_uses_dma(hcd)) { kfree(addr); return; } for (i = 0; i < HCD_BUFFER_POOLS; i++) { if (size <= pool_max[i]) { dma_pool_free(hcd->pool[i], addr, dma); return; } } dma_free_coherent(hcd->self.sysdev, size, addr, dma); } void *hcd_buffer_alloc_pages(struct usb_hcd *hcd, size_t size, gfp_t mem_flags, dma_addr_t *dma) { if (size == 0) return NULL; if (hcd->localmem_pool) return gen_pool_dma_alloc_align(hcd->localmem_pool, size, dma, PAGE_SIZE); /* some USB hosts just use PIO */ if (!hcd_uses_dma(hcd)) { *dma = DMA_MAPPING_ERROR; return (void *)__get_free_pages(mem_flags, get_order(size)); } return dma_alloc_coherent(hcd->self.sysdev, size, dma, mem_flags); } void hcd_buffer_free_pages(struct usb_hcd *hcd, size_t size, void *addr, dma_addr_t dma) { if (!addr) return; if (hcd->localmem_pool) { gen_pool_free(hcd->localmem_pool, (unsigned long)addr, size); return; } if (!hcd_uses_dma(hcd)) { free_pages((unsigned long)addr, get_order(size)); return; } dma_free_coherent(hcd->self.sysdev, size, addr, dma); } |
3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 | // SPDX-License-Identifier: GPL-2.0-or-later /* System trusted keyring for trusted public keys * * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/export.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/cred.h> #include <linux/err.h> #include <linux/slab.h> #include <linux/uidgid.h> #include <linux/verification.h> #include <keys/asymmetric-type.h> #include <keys/system_keyring.h> #include <crypto/pkcs7.h> static struct key *builtin_trusted_keys; #ifdef CONFIG_SECONDARY_TRUSTED_KEYRING static struct key *secondary_trusted_keys; #endif #ifdef CONFIG_INTEGRITY_MACHINE_KEYRING static struct key *machine_trusted_keys; #endif #ifdef CONFIG_INTEGRITY_PLATFORM_KEYRING static struct key *platform_trusted_keys; #endif extern __initconst const u8 system_certificate_list[]; extern __initconst const unsigned long system_certificate_list_size; extern __initconst const unsigned long module_cert_size; /** * restrict_link_to_builtin_trusted - Restrict keyring addition by built in CA * * Restrict the addition of keys into a keyring based on the key-to-be-added * being vouched for by a key in the built in system keyring. */ int restrict_link_by_builtin_trusted(struct key *dest_keyring, const struct key_type *type, const union key_payload *payload, struct key *restriction_key) { return restrict_link_by_signature(dest_keyring, type, payload, builtin_trusted_keys); } #ifdef CONFIG_SECONDARY_TRUSTED_KEYRING /** * restrict_link_by_builtin_and_secondary_trusted - Restrict keyring * addition by both builtin and secondary keyrings * * Restrict the addition of keys into a keyring based on the key-to-be-added * being vouched for by a key in either the built-in or the secondary system * keyrings. */ int restrict_link_by_builtin_and_secondary_trusted( struct key *dest_keyring, const struct key_type *type, const union key_payload *payload, struct key *restrict_key) { /* If we have a secondary trusted keyring, then that contains a link * through to the builtin keyring and the search will follow that link. */ if (type == &key_type_keyring && dest_keyring == secondary_trusted_keys && payload == &builtin_trusted_keys->payload) /* Allow the builtin keyring to be added to the secondary */ return 0; return restrict_link_by_signature(dest_keyring, type, payload, secondary_trusted_keys); } /** * Allocate a struct key_restriction for the "builtin and secondary trust" * keyring. Only for use in system_trusted_keyring_init(). */ static __init struct key_restriction *get_builtin_and_secondary_restriction(void) { struct key_restriction *restriction; restriction = kzalloc(sizeof(struct key_restriction), GFP_KERNEL); if (!restriction) panic("Can't allocate secondary trusted keyring restriction\n"); if (IS_ENABLED(CONFIG_INTEGRITY_MACHINE_KEYRING)) restriction->check = restrict_link_by_builtin_secondary_and_machine; else restriction->check = restrict_link_by_builtin_and_secondary_trusted; return restriction; } #endif #ifdef CONFIG_INTEGRITY_MACHINE_KEYRING void __init set_machine_trusted_keys(struct key *keyring) { machine_trusted_keys = keyring; if (key_link(secondary_trusted_keys, machine_trusted_keys) < 0) panic("Can't link (machine) trusted keyrings\n"); } /** * restrict_link_by_builtin_secondary_and_machine - Restrict keyring addition. * @dest_keyring: Keyring being linked to. * @type: The type of key being added. * @payload: The payload of the new key. * @restrict_key: A ring of keys that can be used to vouch for the new cert. * * Restrict the addition of keys into a keyring based on the key-to-be-added * being vouched for by a key in either the built-in, the secondary, or * the machine keyrings. */ int restrict_link_by_builtin_secondary_and_machine( struct key *dest_keyring, const struct key_type *type, const union key_payload *payload, struct key *restrict_key) { if (machine_trusted_keys && type == &key_type_keyring && dest_keyring == secondary_trusted_keys && payload == &machine_trusted_keys->payload) /* Allow the machine keyring to be added to the secondary */ return 0; return restrict_link_by_builtin_and_secondary_trusted(dest_keyring, type, payload, restrict_key); } #endif /* * Create the trusted keyrings */ static __init int system_trusted_keyring_init(void) { pr_notice("Initialise system trusted keyrings\n"); builtin_trusted_keys = keyring_alloc(".builtin_trusted_keys", GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, current_cred(), ((KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH), KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); if (IS_ERR(builtin_trusted_keys)) panic("Can't allocate builtin trusted keyring\n"); #ifdef CONFIG_SECONDARY_TRUSTED_KEYRING secondary_trusted_keys = keyring_alloc(".secondary_trusted_keys", GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, current_cred(), ((KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH | KEY_USR_WRITE), KEY_ALLOC_NOT_IN_QUOTA, get_builtin_and_secondary_restriction(), NULL); if (IS_ERR(secondary_trusted_keys)) panic("Can't allocate secondary trusted keyring\n"); if (key_link(secondary_trusted_keys, builtin_trusted_keys) < 0) panic("Can't link trusted keyrings\n"); #endif return 0; } /* * Must be initialised before we try and load the keys into the keyring. */ device_initcall(system_trusted_keyring_init); __init int load_module_cert(struct key *keyring) { if (!IS_ENABLED(CONFIG_IMA_APPRAISE_MODSIG)) return 0; pr_notice("Loading compiled-in module X.509 certificates\n"); return x509_load_certificate_list(system_certificate_list, module_cert_size, keyring); } /* * Load the compiled-in list of X.509 certificates. */ static __init int load_system_certificate_list(void) { const u8 *p; unsigned long size; pr_notice("Loading compiled-in X.509 certificates\n"); #ifdef CONFIG_MODULE_SIG p = system_certificate_list; size = system_certificate_list_size; #else p = system_certificate_list + module_cert_size; size = system_certificate_list_size - module_cert_size; #endif return x509_load_certificate_list(p, size, builtin_trusted_keys); } late_initcall(load_system_certificate_list); #ifdef CONFIG_SYSTEM_DATA_VERIFICATION /** * verify_pkcs7_message_sig - Verify a PKCS#7-based signature on system data. * @data: The data to be verified (NULL if expecting internal data). * @len: Size of @data. * @pkcs7: The PKCS#7 message that is the signature. * @trusted_keys: Trusted keys to use (NULL for builtin trusted keys only, * (void *)1UL for all trusted keys). * @usage: The use to which the key is being put. * @view_content: Callback to gain access to content. * @ctx: Context for callback. */ int verify_pkcs7_message_sig(const void *data, size_t len, struct pkcs7_message *pkcs7, struct key *trusted_keys, enum key_being_used_for usage, int (*view_content)(void *ctx, const void *data, size_t len, size_t asn1hdrlen), void *ctx) { int ret; /* The data should be detached - so we need to supply it. */ if (data && pkcs7_supply_detached_data(pkcs7, data, len) < 0) { pr_err("PKCS#7 signature with non-detached data\n"); ret = -EBADMSG; goto error; } ret = pkcs7_verify(pkcs7, usage); if (ret < 0) goto error; if (!trusted_keys) { trusted_keys = builtin_trusted_keys; } else if (trusted_keys == VERIFY_USE_SECONDARY_KEYRING) { #ifdef CONFIG_SECONDARY_TRUSTED_KEYRING trusted_keys = secondary_trusted_keys; #else trusted_keys = builtin_trusted_keys; #endif } else if (trusted_keys == VERIFY_USE_PLATFORM_KEYRING) { #ifdef CONFIG_INTEGRITY_PLATFORM_KEYRING trusted_keys = platform_trusted_keys; #else trusted_keys = NULL; #endif if (!trusted_keys) { ret = -ENOKEY; pr_devel("PKCS#7 platform keyring is not available\n"); goto error; } ret = is_key_on_revocation_list(pkcs7); if (ret != -ENOKEY) { pr_devel("PKCS#7 platform key is on revocation list\n"); goto error; } } ret = pkcs7_validate_trust(pkcs7, trusted_keys); if (ret < 0) { if (ret == -ENOKEY) pr_devel("PKCS#7 signature not signed with a trusted key\n"); goto error; } if (view_content) { size_t asn1hdrlen; ret = pkcs7_get_content_data(pkcs7, &data, &len, &asn1hdrlen); if (ret < 0) { if (ret == -ENODATA) pr_devel("PKCS#7 message does not contain data\n"); goto error; } ret = view_content(ctx, data, len, asn1hdrlen); } error: pr_devel("<==%s() = %d\n", __func__, ret); return ret; } /** * verify_pkcs7_signature - Verify a PKCS#7-based signature on system data. * @data: The data to be verified (NULL if expecting internal data). * @len: Size of @data. * @raw_pkcs7: The PKCS#7 message that is the signature. * @pkcs7_len: The size of @raw_pkcs7. * @trusted_keys: Trusted keys to use (NULL for builtin trusted keys only, * (void *)1UL for all trusted keys). * @usage: The use to which the key is being put. * @view_content: Callback to gain access to content. * @ctx: Context for callback. */ int verify_pkcs7_signature(const void *data, size_t len, const void *raw_pkcs7, size_t pkcs7_len, struct key *trusted_keys, enum key_being_used_for usage, int (*view_content)(void *ctx, const void *data, size_t len, size_t asn1hdrlen), void *ctx) { struct pkcs7_message *pkcs7; int ret; pkcs7 = pkcs7_parse_message(raw_pkcs7, pkcs7_len); if (IS_ERR(pkcs7)) return PTR_ERR(pkcs7); ret = verify_pkcs7_message_sig(data, len, pkcs7, trusted_keys, usage, view_content, ctx); pkcs7_free_message(pkcs7); pr_devel("<==%s() = %d\n", __func__, ret); return ret; } EXPORT_SYMBOL_GPL(verify_pkcs7_signature); #endif /* CONFIG_SYSTEM_DATA_VERIFICATION */ #ifdef CONFIG_INTEGRITY_PLATFORM_KEYRING void __init set_platform_trusted_keys(struct key *keyring) { platform_trusted_keys = keyring; } #endif |
31530 22 22 22 22 22 21 22 22 22 22 22 22 6502 6509 21 21 6509 6501 6510 6502 6510 6507 6510 9052 9043 9049 3029 9042 9060 553 9046 9060 9060 9060 9047 9045 21 9060 22 21 22 331 9069 6951 6949 8999 9000 2 7674 6208 7635 7672 124 9046 9079 9077 6507 8919 7837 9050 8999 811 1354 6772 6500 1294 8999 8986 9000 8999 810 811 811 811 6509 6507 1762 6289 5736 58 6327 1604 738 6330 6270 6509 67 67 9000 9000 9000 347 8915 8928 8912 1294 8368 211 810 1294 8999 8374 808 9000 9000 8837 4924 3 8987 8997 9000 8987 67 63 67 67 67 67 6501 6500 6509 6495 75 6208 7675 1 4 67 67 4 4 67 4 4 67 67 67 1 1 1 1 1 1 1 1 67 67 67 67 67 1 1 1 1 1 1 8984 8984 8831 344 859 7181 859 6504 6509 67 67 6453 67 67 67 67 6497 6509 6492 7 759 6505 6476 6469 14 75 75 75 6497 6519 56 6497 7 7 344 344 344 22 6733 8834 2723 2712 24 2725 22 2727 113 8827 8844 7835 6978 6983 2 8828 8837 2 8829 8840 11 8826 8844 8831 6914 5630 6620 30 13 10 10 1 6 9 10 202 202 202 200 202 202 202 202 202 202 202 202 202 202 202 6 6 6 2 6 6 6 2 6 4 6 4 6 2 4 6 6 2 6 6 2 6 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 1993 Linus Torvalds * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 * SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000 * Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002 * Numa awareness, Christoph Lameter, SGI, June 2005 * Improving global KVA allocator, Uladzislau Rezki, Sony, May 2019 */ #include <linux/vmalloc.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/highmem.h> #include <linux/sched/signal.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/interrupt.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/set_memory.h> #include <linux/debugobjects.h> #include <linux/kallsyms.h> #include <linux/list.h> #include <linux/notifier.h> #include <linux/rbtree.h> #include <linux/xarray.h> #include <linux/io.h> #include <linux/rcupdate.h> #include <linux/pfn.h> #include <linux/kmemleak.h> #include <linux/atomic.h> #include <linux/compiler.h> #include <linux/memcontrol.h> #include <linux/llist.h> #include <linux/bitops.h> #include <linux/rbtree_augmented.h> #include <linux/overflow.h> #include <linux/pgtable.h> #include <linux/uaccess.h> #include <linux/hugetlb.h> #include <linux/sched/mm.h> #include <linux/io.h> #include <asm/tlbflush.h> #include <asm/shmparam.h> #include "internal.h" #include "pgalloc-track.h" #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP static unsigned int __ro_after_init ioremap_max_page_shift = BITS_PER_LONG - 1; static int __init set_nohugeiomap(char *str) { ioremap_max_page_shift = PAGE_SHIFT; return 0; } early_param("nohugeiomap", set_nohugeiomap); #else /* CONFIG_HAVE_ARCH_HUGE_VMAP */ static const unsigned int ioremap_max_page_shift = PAGE_SHIFT; #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ #ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC static bool __ro_after_init vmap_allow_huge = true; static int __init set_nohugevmalloc(char *str) { vmap_allow_huge = false; return 0; } early_param("nohugevmalloc", set_nohugevmalloc); #else /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */ static const bool vmap_allow_huge = false; #endif /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */ bool is_vmalloc_addr(const void *x) { unsigned long addr = (unsigned long)kasan_reset_tag(x); return addr >= VMALLOC_START && addr < VMALLOC_END; } EXPORT_SYMBOL(is_vmalloc_addr); struct vfree_deferred { struct llist_head list; struct work_struct wq; }; static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred); static void __vunmap(const void *, int); static void free_work(struct work_struct *w) { struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq); struct llist_node *t, *llnode; llist_for_each_safe(llnode, t, llist_del_all(&p->list)) __vunmap((void *)llnode, 1); } /*** Page table manipulation functions ***/ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot, unsigned int max_page_shift, pgtbl_mod_mask *mask) { pte_t *pte; u64 pfn; unsigned long size = PAGE_SIZE; pfn = phys_addr >> PAGE_SHIFT; pte = pte_alloc_kernel_track(pmd, addr, mask); if (!pte) return -ENOMEM; do { BUG_ON(!pte_none(*pte)); #ifdef CONFIG_HUGETLB_PAGE size = arch_vmap_pte_range_map_size(addr, end, pfn, max_page_shift); if (size != PAGE_SIZE) { pte_t entry = pfn_pte(pfn, prot); entry = arch_make_huge_pte(entry, ilog2(size), 0); set_huge_pte_at(&init_mm, addr, pte, entry); pfn += PFN_DOWN(size); continue; } #endif set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot)); pfn++; } while (pte += PFN_DOWN(size), addr += size, addr != end); *mask |= PGTBL_PTE_MODIFIED; return 0; } static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot, unsigned int max_page_shift) { if (max_page_shift < PMD_SHIFT) return 0; if (!arch_vmap_pmd_supported(prot)) return 0; if ((end - addr) != PMD_SIZE) return 0; if (!IS_ALIGNED(addr, PMD_SIZE)) return 0; if (!IS_ALIGNED(phys_addr, PMD_SIZE)) return 0; if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr)) return 0; return pmd_set_huge(pmd, phys_addr, prot); } static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot, unsigned int max_page_shift, pgtbl_mod_mask *mask) { pmd_t *pmd; unsigned long next; pmd = pmd_alloc_track(&init_mm, pud, addr, mask); if (!pmd) return -ENOMEM; do { next = pmd_addr_end(addr, end); if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot, max_page_shift)) { *mask |= PGTBL_PMD_MODIFIED; continue; } if (vmap_pte_range(pmd, addr, next, phys_addr, prot, max_page_shift, mask)) return -ENOMEM; } while (pmd++, phys_addr += (next - addr), addr = next, addr != end); return 0; } static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot, unsigned int max_page_shift) { if (max_page_shift < PUD_SHIFT) return 0; if (!arch_vmap_pud_supported(prot)) return 0; if ((end - addr) != PUD_SIZE) return 0; if (!IS_ALIGNED(addr, PUD_SIZE)) return 0; if (!IS_ALIGNED(phys_addr, PUD_SIZE)) return 0; if (pud_present(*pud) && !pud_free_pmd_page(pud, addr)) return 0; return pud_set_huge(pud, phys_addr, prot); } static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot, unsigned int max_page_shift, pgtbl_mod_mask *mask) { pud_t *pud; unsigned long next; pud = pud_alloc_track(&init_mm, p4d, addr, mask); if (!pud) return -ENOMEM; do { next = pud_addr_end(addr, end); if (vmap_try_huge_pud(pud, addr, next, phys_addr, prot, max_page_shift)) { *mask |= PGTBL_PUD_MODIFIED; continue; } if (vmap_pmd_range(pud, addr, next, phys_addr, prot, max_page_shift, mask)) return -ENOMEM; } while (pud++, phys_addr += (next - addr), addr = next, addr != end); return 0; } static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot, unsigned int max_page_shift) { if (max_page_shift < P4D_SHIFT) return 0; if (!arch_vmap_p4d_supported(prot)) return 0; if ((end - addr) != P4D_SIZE) return 0; if (!IS_ALIGNED(addr, P4D_SIZE)) return 0; if (!IS_ALIGNED(phys_addr, P4D_SIZE)) return 0; if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr)) return 0; return p4d_set_huge(p4d, phys_addr, prot); } static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot, unsigned int max_page_shift, pgtbl_mod_mask *mask) { p4d_t *p4d; unsigned long next; p4d = p4d_alloc_track(&init_mm, pgd, addr, mask); if (!p4d) return -ENOMEM; do { next = p4d_addr_end(addr, end); if (vmap_try_huge_p4d(p4d, addr, next, phys_addr, prot, max_page_shift)) { *mask |= PGTBL_P4D_MODIFIED; continue; } if (vmap_pud_range(p4d, addr, next, phys_addr, prot, max_page_shift, mask)) return -ENOMEM; } while (p4d++, phys_addr += (next - addr), addr = next, addr != end); return 0; } static int vmap_range_noflush(unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot, unsigned int max_page_shift) { pgd_t *pgd; unsigned long start; unsigned long next; int err; pgtbl_mod_mask mask = 0; might_sleep(); BUG_ON(addr >= end); start = addr; pgd = pgd_offset_k(addr); do { next = pgd_addr_end(addr, end); err = vmap_p4d_range(pgd, addr, next, phys_addr, prot, max_page_shift, &mask); if (err) break; } while (pgd++, phys_addr += (next - addr), addr = next, addr != end); if (mask & ARCH_PAGE_TABLE_SYNC_MASK) arch_sync_kernel_mappings(start, end); return err; } int ioremap_page_range(unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot) { int err; prot = pgprot_nx(prot); err = vmap_range_noflush(addr, end, phys_addr, prot, ioremap_max_page_shift); flush_cache_vmap(addr, end); if (!err) err = kmsan_ioremap_page_range(addr, end, phys_addr, prot, ioremap_max_page_shift); if (IS_ENABLED(CONFIG_ARCH_HAS_IOREMAP_PHYS_HOOKS) && !err) ioremap_phys_range_hook(phys_addr, end - addr, prot); return err; } static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pgtbl_mod_mask *mask) { pte_t *pte; pte = pte_offset_kernel(pmd, addr); do { pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte); WARN_ON(!pte_none(ptent) && !pte_present(ptent)); } while (pte++, addr += PAGE_SIZE, addr != end); *mask |= PGTBL_PTE_MODIFIED; } static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, pgtbl_mod_mask *mask) { pmd_t *pmd; unsigned long next; int cleared; pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); cleared = pmd_clear_huge(pmd); if (cleared || pmd_bad(*pmd)) *mask |= PGTBL_PMD_MODIFIED; if (cleared) continue; if (pmd_none_or_clear_bad(pmd)) continue; vunmap_pte_range(pmd, addr, next, mask); cond_resched(); } while (pmd++, addr = next, addr != end); } static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, pgtbl_mod_mask *mask) { pud_t *pud; unsigned long next; int cleared; pud = pud_offset(p4d, addr); do { next = pud_addr_end(addr, end); cleared = pud_clear_huge(pud); if (cleared || pud_bad(*pud)) *mask |= PGTBL_PUD_MODIFIED; if (cleared) continue; if (pud_none_or_clear_bad(pud)) continue; vunmap_pmd_range(pud, addr, next, mask); } while (pud++, addr = next, addr != end); } static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, pgtbl_mod_mask *mask) { p4d_t *p4d; unsigned long next; p4d = p4d_offset(pgd, addr); do { next = p4d_addr_end(addr, end); p4d_clear_huge(p4d); if (p4d_bad(*p4d)) *mask |= PGTBL_P4D_MODIFIED; if (p4d_none_or_clear_bad(p4d)) continue; vunmap_pud_range(p4d, addr, next, mask); } while (p4d++, addr = next, addr != end); } /* * vunmap_range_noflush is similar to vunmap_range, but does not * flush caches or TLBs. * * The caller is responsible for calling flush_cache_vmap() before calling * this function, and flush_tlb_kernel_range after it has returned * successfully (and before the addresses are expected to cause a page fault * or be re-mapped for something else, if TLB flushes are being delayed or * coalesced). * * This is an internal function only. Do not use outside mm/. */ void __vunmap_range_noflush(unsigned long start, unsigned long end) { unsigned long next; pgd_t *pgd; unsigned long addr = start; pgtbl_mod_mask mask = 0; BUG_ON(addr >= end); pgd = pgd_offset_k(addr); do { next = pgd_addr_end(addr, end); if (pgd_bad(*pgd)) mask |= PGTBL_PGD_MODIFIED; if (pgd_none_or_clear_bad(pgd)) continue; vunmap_p4d_range(pgd, addr, next, &mask); } while (pgd++, addr = next, addr != end); if (mask & ARCH_PAGE_TABLE_SYNC_MASK) arch_sync_kernel_mappings(start, end); } void vunmap_range_noflush(unsigned long start, unsigned long end) { kmsan_vunmap_range_noflush(start, end); __vunmap_range_noflush(start, end); } /** * vunmap_range - unmap kernel virtual addresses * @addr: start of the VM area to unmap * @end: end of the VM area to unmap (non-inclusive) * * Clears any present PTEs in the virtual address range, flushes TLBs and * caches. Any subsequent access to the address before it has been re-mapped * is a kernel bug. */ void vunmap_range(unsigned long addr, unsigned long end) { flush_cache_vunmap(addr, end); vunmap_range_noflush(addr, end); flush_tlb_kernel_range(addr, end); } static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, int *nr, pgtbl_mod_mask *mask) { pte_t *pte; /* * nr is a running index into the array which helps higher level * callers keep track of where we're up to. */ pte = pte_alloc_kernel_track(pmd, addr, mask); if (!pte) return -ENOMEM; do { struct page *page = pages[*nr]; if (WARN_ON(!pte_none(*pte))) return -EBUSY; if (WARN_ON(!page)) return -ENOMEM; if (WARN_ON(!pfn_valid(page_to_pfn(page)))) return -EINVAL; set_pte_at(&init_mm, addr, pte, mk_pte(page, prot)); (*nr)++; } while (pte++, addr += PAGE_SIZE, addr != end); *mask |= PGTBL_PTE_MODIFIED; return 0; } static int vmap_pages_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, int *nr, pgtbl_mod_mask *mask) { pmd_t *pmd; unsigned long next; pmd = pmd_alloc_track(&init_mm, pud, addr, mask); if (!pmd) return -ENOMEM; do { next = pmd_addr_end(addr, end); if (vmap_pages_pte_range(pmd, addr, next, prot, pages, nr, mask)) return -ENOMEM; } while (pmd++, addr = next, addr != end); return 0; } static int vmap_pages_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, int *nr, pgtbl_mod_mask *mask) { pud_t *pud; unsigned long next; pud = pud_alloc_track(&init_mm, p4d, addr, mask); if (!pud) return -ENOMEM; do { next = pud_addr_end(addr, end); if (vmap_pages_pmd_range(pud, addr, next, prot, pages, nr, mask)) return -ENOMEM; } while (pud++, addr = next, addr != end); return 0; } static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, int *nr, pgtbl_mod_mask *mask) { p4d_t *p4d; unsigned long next; p4d = p4d_alloc_track(&init_mm, pgd, addr, mask); if (!p4d) return -ENOMEM; do { next = p4d_addr_end(addr, end); if (vmap_pages_pud_range(p4d, addr, next, prot, pages, nr, mask)) return -ENOMEM; } while (p4d++, addr = next, addr != end); return 0; } static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages) { unsigned long start = addr; pgd_t *pgd; unsigned long next; int err = 0; int nr = 0; pgtbl_mod_mask mask = 0; BUG_ON(addr >= end); pgd = pgd_offset_k(addr); do { next = pgd_addr_end(addr, end); if (pgd_bad(*pgd)) mask |= PGTBL_PGD_MODIFIED; err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr, &mask); if (err) return err; } while (pgd++, addr = next, addr != end); if (mask & ARCH_PAGE_TABLE_SYNC_MASK) arch_sync_kernel_mappings(start, end); return 0; } /* * vmap_pages_range_noflush is similar to vmap_pages_range, but does not * flush caches. * * The caller is responsible for calling flush_cache_vmap() after this * function returns successfully and before the addresses are accessed. * * This is an internal function only. Do not use outside mm/. */ int __vmap_pages_range_noflush(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, unsigned int page_shift) { unsigned int i, nr = (end - addr) >> PAGE_SHIFT; WARN_ON(page_shift < PAGE_SHIFT); if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) || page_shift == PAGE_SHIFT) return vmap_small_pages_range_noflush(addr, end, prot, pages); for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) { int err; err = vmap_range_noflush(addr, addr + (1UL << page_shift), page_to_phys(pages[i]), prot, page_shift); if (err) return err; addr += 1UL << page_shift; } return 0; } int vmap_pages_range_noflush(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, unsigned int page_shift) { int ret = kmsan_vmap_pages_range_noflush(addr, end, prot, pages, page_shift); if (ret) return ret; return __vmap_pages_range_noflush(addr, end, prot, pages, page_shift); } /** * vmap_pages_range - map pages to a kernel virtual address * @addr: start of the VM area to map * @end: end of the VM area to map (non-inclusive) * @prot: page protection flags to use * @pages: pages to map (always PAGE_SIZE pages) * @page_shift: maximum shift that the pages may be mapped with, @pages must * be aligned and contiguous up to at least this shift. * * RETURNS: * 0 on success, -errno on failure. */ static int vmap_pages_range(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, unsigned int page_shift) { int err; err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift); flush_cache_vmap(addr, end); return err; } int is_vmalloc_or_module_addr(const void *x) { /* * ARM, x86-64 and sparc64 put modules in a special place, * and fall back on vmalloc() if that fails. Others * just put it in the vmalloc space. */ #if defined(CONFIG_MODULES) && defined(MODULES_VADDR) unsigned long addr = (unsigned long)kasan_reset_tag(x); if (addr >= MODULES_VADDR && addr < MODULES_END) return 1; #endif return is_vmalloc_addr(x); } /* * Walk a vmap address to the struct page it maps. Huge vmap mappings will * return the tail page that corresponds to the base page address, which * matches small vmap mappings. */ struct page *vmalloc_to_page(const void *vmalloc_addr) { unsigned long addr = (unsigned long) vmalloc_addr; struct page *page = NULL; pgd_t *pgd = pgd_offset_k(addr); p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *ptep, pte; /* * XXX we might need to change this if we add VIRTUAL_BUG_ON for * architectures that do not vmalloc module space */ VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr)); if (pgd_none(*pgd)) return NULL; if (WARN_ON_ONCE(pgd_leaf(*pgd))) return NULL; /* XXX: no allowance for huge pgd */ if (WARN_ON_ONCE(pgd_bad(*pgd))) return NULL; p4d = p4d_offset(pgd, addr); if (p4d_none(*p4d)) return NULL; if (p4d_leaf(*p4d)) return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT); if (WARN_ON_ONCE(p4d_bad(*p4d))) return NULL; pud = pud_offset(p4d, addr); if (pud_none(*pud)) return NULL; if (pud_leaf(*pud)) return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); if (WARN_ON_ONCE(pud_bad(*pud))) return NULL; pmd = pmd_offset(pud, addr); if (pmd_none(*pmd)) return NULL; if (pmd_leaf(*pmd)) return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); if (WARN_ON_ONCE(pmd_bad(*pmd))) return NULL; ptep = pte_offset_map(pmd, addr); pte = *ptep; if (pte_present(pte)) page = pte_page(pte); pte_unmap(ptep); return page; } EXPORT_SYMBOL(vmalloc_to_page); /* * Map a vmalloc()-space virtual address to the physical page frame number. */ unsigned long vmalloc_to_pfn(const void *vmalloc_addr) { return page_to_pfn(vmalloc_to_page(vmalloc_addr)); } EXPORT_SYMBOL(vmalloc_to_pfn); /*** Global kva allocator ***/ #define DEBUG_AUGMENT_PROPAGATE_CHECK 0 #define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0 static DEFINE_SPINLOCK(vmap_area_lock); static DEFINE_SPINLOCK(free_vmap_area_lock); /* Export for kexec only */ LIST_HEAD(vmap_area_list); static struct rb_root vmap_area_root = RB_ROOT; static bool vmap_initialized __read_mostly; static struct rb_root purge_vmap_area_root = RB_ROOT; static LIST_HEAD(purge_vmap_area_list); static DEFINE_SPINLOCK(purge_vmap_area_lock); /* * This kmem_cache is used for vmap_area objects. Instead of * allocating from slab we reuse an object from this cache to * make things faster. Especially in "no edge" splitting of * free block. */ static struct kmem_cache *vmap_area_cachep; /* * This linked list is used in pair with free_vmap_area_root. * It gives O(1) access to prev/next to perform fast coalescing. */ static LIST_HEAD(free_vmap_area_list); /* * This augment red-black tree represents the free vmap space. * All vmap_area objects in this tree are sorted by va->va_start * address. It is used for allocation and merging when a vmap * object is released. * * Each vmap_area node contains a maximum available free block * of its sub-tree, right or left. Therefore it is possible to * find a lowest match of free area. */ static struct rb_root free_vmap_area_root = RB_ROOT; /* * Preload a CPU with one object for "no edge" split case. The * aim is to get rid of allocations from the atomic context, thus * to use more permissive allocation masks. */ static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node); static __always_inline unsigned long va_size(struct vmap_area *va) { return (va->va_end - va->va_start); } static __always_inline unsigned long get_subtree_max_size(struct rb_node *node) { struct vmap_area *va; va = rb_entry_safe(node, struct vmap_area, rb_node); return va ? va->subtree_max_size : 0; } RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb, struct vmap_area, rb_node, unsigned long, subtree_max_size, va_size) static void purge_vmap_area_lazy(void); static BLOCKING_NOTIFIER_HEAD(vmap_notify_list); static void drain_vmap_area_work(struct work_struct *work); static DECLARE_WORK(drain_vmap_work, drain_vmap_area_work); static atomic_long_t nr_vmalloc_pages; unsigned long vmalloc_nr_pages(void) { return atomic_long_read(&nr_vmalloc_pages); } EXPORT_SYMBOL_GPL(vmalloc_nr_pages); /* Look up the first VA which satisfies addr < va_end, NULL if none. */ static struct vmap_area *find_vmap_area_exceed_addr(unsigned long addr) { struct vmap_area *va = NULL; struct rb_node *n = vmap_area_root.rb_node; addr = (unsigned long)kasan_reset_tag((void *)addr); while (n) { struct vmap_area *tmp; tmp = rb_entry(n, struct vmap_area, rb_node); if (tmp->va_end > addr) { va = tmp; if (tmp->va_start <= addr) break; n = n->rb_left; } else n = n->rb_right; } return va; } static struct vmap_area *__find_vmap_area(unsigned long addr, struct rb_root *root) { struct rb_node *n = root->rb_node; addr = (unsigned long)kasan_reset_tag((void *)addr); while (n) { struct vmap_area *va; va = rb_entry(n, struct vmap_area, rb_node); if (addr < va->va_start) n = n->rb_left; else if (addr >= va->va_end) n = n->rb_right; else return va; } return NULL; } /* * This function returns back addresses of parent node * and its left or right link for further processing. * * Otherwise NULL is returned. In that case all further * steps regarding inserting of conflicting overlap range * have to be declined and actually considered as a bug. */ static __always_inline struct rb_node ** find_va_links(struct vmap_area *va, struct rb_root *root, struct rb_node *from, struct rb_node **parent) { struct vmap_area *tmp_va; struct rb_node **link; if (root) { link = &root->rb_node; if (unlikely(!*link)) { *parent = NULL; return link; } } else { link = &from; } /* * Go to the bottom of the tree. When we hit the last point * we end up with parent rb_node and correct direction, i name * it link, where the new va->rb_node will be attached to. */ do { tmp_va = rb_entry(*link, struct vmap_area, rb_node); /* * During the traversal we also do some sanity check. * Trigger the BUG() if there are sides(left/right) * or full overlaps. */ if (va->va_end <= tmp_va->va_start) link = &(*link)->rb_left; else if (va->va_start >= tmp_va->va_end) link = &(*link)->rb_right; else { WARN(1, "vmalloc bug: 0x%lx-0x%lx overlaps with 0x%lx-0x%lx\n", va->va_start, va->va_end, tmp_va->va_start, tmp_va->va_end); return NULL; } } while (*link); *parent = &tmp_va->rb_node; return link; } static __always_inline struct list_head * get_va_next_sibling(struct rb_node *parent, struct rb_node **link) { struct list_head *list; if (unlikely(!parent)) /* * The red-black tree where we try to find VA neighbors * before merging or inserting is empty, i.e. it means * there is no free vmap space. Normally it does not * happen but we handle this case anyway. */ return NULL; list = &rb_entry(parent, struct vmap_area, rb_node)->list; return (&parent->rb_right == link ? list->next : list); } static __always_inline void __link_va(struct vmap_area *va, struct rb_root *root, struct rb_node *parent, struct rb_node **link, struct list_head *head, bool augment) { /* * VA is still not in the list, but we can * identify its future previous list_head node. */ if (likely(parent)) { head = &rb_entry(parent, struct vmap_area, rb_node)->list; if (&parent->rb_right != link) head = head->prev; } /* Insert to the rb-tree */ rb_link_node(&va->rb_node, parent, link); if (augment) { /* * Some explanation here. Just perform simple insertion * to the tree. We do not set va->subtree_max_size to * its current size before calling rb_insert_augmented(). * It is because we populate the tree from the bottom * to parent levels when the node _is_ in the tree. * * Therefore we set subtree_max_size to zero after insertion, * to let __augment_tree_propagate_from() puts everything to * the correct order later on. */ rb_insert_augmented(&va->rb_node, root, &free_vmap_area_rb_augment_cb); va->subtree_max_size = 0; } else { rb_insert_color(&va->rb_node, root); } /* Address-sort this list */ list_add(&va->list, head); } static __always_inline void link_va(struct vmap_area *va, struct rb_root *root, struct rb_node *parent, struct rb_node **link, struct list_head *head) { __link_va(va, root, parent, link, head, false); } static __always_inline void link_va_augment(struct vmap_area *va, struct rb_root *root, struct rb_node *parent, struct rb_node **link, struct list_head *head) { __link_va(va, root, parent, link, head, true); } static __always_inline void __unlink_va(struct vmap_area *va, struct rb_root *root, bool augment) { if (WARN_ON(RB_EMPTY_NODE(&va->rb_node))) return; if (augment) rb_erase_augmented(&va->rb_node, root, &free_vmap_area_rb_augment_cb); else rb_erase(&va->rb_node, root); list_del_init(&va->list); RB_CLEAR_NODE(&va->rb_node); } static __always_inline void unlink_va(struct vmap_area *va, struct rb_root *root) { __unlink_va(va, root, false); } static __always_inline void unlink_va_augment(struct vmap_area *va, struct rb_root *root) { __unlink_va(va, root, true); } #if DEBUG_AUGMENT_PROPAGATE_CHECK /* * Gets called when remove the node and rotate. */ static __always_inline unsigned long compute_subtree_max_size(struct vmap_area *va) { return max3(va_size(va), get_subtree_max_size(va->rb_node.rb_left), get_subtree_max_size(va->rb_node.rb_right)); } static void augment_tree_propagate_check(void) { struct vmap_area *va; unsigned long computed_size; list_for_each_entry(va, &free_vmap_area_list, list) { computed_size = compute_subtree_max_size(va); if (computed_size != va->subtree_max_size) pr_emerg("tree is corrupted: %lu, %lu\n", va_size(va), va->subtree_max_size); } } #endif /* * This function populates subtree_max_size from bottom to upper * levels starting from VA point. The propagation must be done * when VA size is modified by changing its va_start/va_end. Or * in case of newly inserting of VA to the tree. * * It means that __augment_tree_propagate_from() must be called: * - After VA has been inserted to the tree(free path); * - After VA has been shrunk(allocation path); * - After VA has been increased(merging path). * * Please note that, it does not mean that upper parent nodes * and their subtree_max_size are recalculated all the time up * to the root node. * * 4--8 * /\ * / \ * / \ * 2--2 8--8 * * For example if we modify the node 4, shrinking it to 2, then * no any modification is required. If we shrink the node 2 to 1 * its subtree_max_size is updated only, and set to 1. If we shrink * the node 8 to 6, then its subtree_max_size is set to 6 and parent * node becomes 4--6. */ static __always_inline void augment_tree_propagate_from(struct vmap_area *va) { /* * Populate the tree from bottom towards the root until * the calculated maximum available size of checked node * is equal to its current one. */ free_vmap_area_rb_augment_cb_propagate(&va->rb_node, NULL); #if DEBUG_AUGMENT_PROPAGATE_CHECK augment_tree_propagate_check(); #endif } static void insert_vmap_area(struct vmap_area *va, struct rb_root *root, struct list_head *head) { struct rb_node **link; struct rb_node *parent; link = find_va_links(va, root, NULL, &parent); if (link) link_va(va, root, parent, link, head); } static void insert_vmap_area_augment(struct vmap_area *va, struct rb_node *from, struct rb_root *root, struct list_head *head) { struct rb_node **link; struct rb_node *parent; if (from) link = find_va_links(va, NULL, from, &parent); else link = find_va_links(va, root, NULL, &parent); if (link) { link_va_augment(va, root, parent, link, head); augment_tree_propagate_from(va); } } /* * Merge de-allocated chunk of VA memory with previous * and next free blocks. If coalesce is not done a new * free area is inserted. If VA has been merged, it is * freed. * * Please note, it can return NULL in case of overlap * ranges, followed by WARN() report. Despite it is a * buggy behaviour, a system can be alive and keep * ongoing. */ static __always_inline struct vmap_area * __merge_or_add_vmap_area(struct vmap_area *va, struct rb_root *root, struct list_head *head, bool augment) { struct vmap_area *sibling; struct list_head *next; struct rb_node **link; struct rb_node *parent; bool merged = false; /* * Find a place in the tree where VA potentially will be * inserted, unless it is merged with its sibling/siblings. */ link = find_va_links(va, root, NULL, &parent); if (!link) return NULL; /* * Get next node of VA to check if merging can be done. */ next = get_va_next_sibling(parent, link); if (unlikely(next == NULL)) goto insert; /* * start end * | | * |<------VA------>|<-----Next----->| * | | * start end */ if (next != head) { sibling = list_entry(next, struct vmap_area, list); if (sibling->va_start == va->va_end) { sibling->va_start = va->va_start; /* Free vmap_area object. */ kmem_cache_free(vmap_area_cachep, va); /* Point to the new merged area. */ va = sibling; merged = true; } } /* * start end * | | * |<-----Prev----->|<------VA------>| * | | * start end */ if (next->prev != head) { sibling = list_entry(next->prev, struct vmap_area, list); if (sibling->va_end == va->va_start) { /* * If both neighbors are coalesced, it is important * to unlink the "next" node first, followed by merging * with "previous" one. Otherwise the tree might not be * fully populated if a sibling's augmented value is * "normalized" because of rotation operations. */ if (merged) __unlink_va(va, root, augment); sibling->va_end = va->va_end; /* Free vmap_area object. */ kmem_cache_free(vmap_area_cachep, va); /* Point to the new merged area. */ va = sibling; merged = true; } } insert: if (!merged) __link_va(va, root, parent, link, head, augment); return va; } static __always_inline struct vmap_area * merge_or_add_vmap_area(struct vmap_area *va, struct rb_root *root, struct list_head *head) { return __merge_or_add_vmap_area(va, root, head, false); } static __always_inline struct vmap_area * merge_or_add_vmap_area_augment(struct vmap_area *va, struct rb_root *root, struct list_head *head) { va = __merge_or_add_vmap_area(va, root, head, true); if (va) augment_tree_propagate_from(va); return va; } static __always_inline bool is_within_this_va(struct vmap_area *va, unsigned long size, unsigned long align, unsigned long vstart) { unsigned long nva_start_addr; if (va->va_start > vstart) nva_start_addr = ALIGN(va->va_start, align); else nva_start_addr = ALIGN(vstart, align); /* Can be overflowed due to big size or alignment. */ if (nva_start_addr + size < nva_start_addr || nva_start_addr < vstart) return false; return (nva_start_addr + size <= va->va_end); } /* * Find the first free block(lowest start address) in the tree, * that will accomplish the request corresponding to passing * parameters. Please note, with an alignment bigger than PAGE_SIZE, * a search length is adjusted to account for worst case alignment * overhead. */ static __always_inline struct vmap_area * find_vmap_lowest_match(struct rb_root *root, unsigned long size, unsigned long align, unsigned long vstart, bool adjust_search_size) { struct vmap_area *va; struct rb_node *node; unsigned long length; /* Start from the root. */ node = root->rb_node; /* Adjust the search size for alignment overhead. */ length = adjust_search_size ? size + align - 1 : size; while (node) { va = rb_entry(node, struct vmap_area, rb_node); if (get_subtree_max_size(node->rb_left) >= length && vstart < va->va_start) { node = node->rb_left; } else { if (is_within_this_va(va, size, align, vstart)) return va; /* * Does not make sense to go deeper towards the right * sub-tree if it does not have a free block that is * equal or bigger to the requested search length. */ if (get_subtree_max_size(node->rb_right) >= length) { node = node->rb_right; continue; } /* * OK. We roll back and find the first right sub-tree, * that will satisfy the search criteria. It can happen * due to "vstart" restriction or an alignment overhead * that is bigger then PAGE_SIZE. */ while ((node = rb_parent(node))) { va = rb_entry(node, struct vmap_area, rb_node); if (is_within_this_va(va, size, align, vstart)) return va; if (get_subtree_max_size(node->rb_right) >= length && vstart <= va->va_start) { /* * Shift the vstart forward. Please note, we update it with * parent's start address adding "1" because we do not want * to enter same sub-tree after it has already been checked * and no suitable free block found there. */ vstart = va->va_start + 1; node = node->rb_right; break; } } } } return NULL; } #if DEBUG_AUGMENT_LOWEST_MATCH_CHECK #include <linux/random.h> static struct vmap_area * find_vmap_lowest_linear_match(struct list_head *head, unsigned long size, unsigned long align, unsigned long vstart) { struct vmap_area *va; list_for_each_entry(va, head, list) { if (!is_within_this_va(va, size, align, vstart)) continue; return va; } return NULL; } static void find_vmap_lowest_match_check(struct rb_root *root, struct list_head *head, unsigned long size, unsigned long align) { struct vmap_area *va_1, *va_2; unsigned long vstart; unsigned int rnd; get_random_bytes(&rnd, sizeof(rnd)); vstart = VMALLOC_START + rnd; va_1 = find_vmap_lowest_match(root, size, align, vstart, false); va_2 = find_vmap_lowest_linear_match(head, size, align, vstart); if (va_1 != va_2) pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n", va_1, va_2, vstart); } #endif enum fit_type { NOTHING_FIT = 0, FL_FIT_TYPE = 1, /* full fit */ LE_FIT_TYPE = 2, /* left edge fit */ RE_FIT_TYPE = 3, /* right edge fit */ NE_FIT_TYPE = 4 /* no edge fit */ }; static __always_inline enum fit_type classify_va_fit_type(struct vmap_area *va, unsigned long nva_start_addr, unsigned long size) { enum fit_type type; /* Check if it is within VA. */ if (nva_start_addr < va->va_start || nva_start_addr + size > va->va_end) return NOTHING_FIT; /* Now classify. */ if (va->va_start == nva_start_addr) { if (va->va_end == nva_start_addr + size) type = FL_FIT_TYPE; else type = LE_FIT_TYPE; } else if (va->va_end == nva_start_addr + size) { type = RE_FIT_TYPE; } else { type = NE_FIT_TYPE; } return type; } static __always_inline int adjust_va_to_fit_type(struct rb_root *root, struct list_head *head, struct vmap_area *va, unsigned long nva_start_addr, unsigned long size) { struct vmap_area *lva = NULL; enum fit_type type = classify_va_fit_type(va, nva_start_addr, size); if (type == FL_FIT_TYPE) { /* * No need to split VA, it fully fits. * * | | * V NVA V * |---------------| */ unlink_va_augment(va, root); kmem_cache_free(vmap_area_cachep, va); } else if (type == LE_FIT_TYPE) { /* * Split left edge of fit VA. * * | | * V NVA V R * |-------|-------| */ va->va_start += size; } else if (type == RE_FIT_TYPE) { /* * Split right edge of fit VA. * * | | * L V NVA V * |-------|-------| */ va->va_end = nva_start_addr; } else if (type == NE_FIT_TYPE) { /* * Split no edge of fit VA. * * | | * L V NVA V R * |---|-------|---| */ lva = __this_cpu_xchg(ne_fit_preload_node, NULL); if (unlikely(!lva)) { /* * For percpu allocator we do not do any pre-allocation * and leave it as it is. The reason is it most likely * never ends up with NE_FIT_TYPE splitting. In case of * percpu allocations offsets and sizes are aligned to * fixed align request, i.e. RE_FIT_TYPE and FL_FIT_TYPE * are its main fitting cases. * * There are a few exceptions though, as an example it is * a first allocation (early boot up) when we have "one" * big free space that has to be split. * * Also we can hit this path in case of regular "vmap" * allocations, if "this" current CPU was not preloaded. * See the comment in alloc_vmap_area() why. If so, then * GFP_NOWAIT is used instead to get an extra object for * split purpose. That is rare and most time does not * occur. * * What happens if an allocation gets failed. Basically, * an "overflow" path is triggered to purge lazily freed * areas to free some memory, then, the "retry" path is * triggered to repeat one more time. See more details * in alloc_vmap_area() function. */ lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT); if (!lva) return -1; } /* * Build the remainder. */ lva->va_start = va->va_start; lva->va_end = nva_start_addr; /* * Shrink this VA to remaining size. */ va->va_start = nva_start_addr + size; } else { return -1; } if (type != FL_FIT_TYPE) { augment_tree_propagate_from(va); if (lva) /* type == NE_FIT_TYPE */ insert_vmap_area_augment(lva, &va->rb_node, root, head); } return 0; } /* * Returns a start address of the newly allocated area, if success. * Otherwise a vend is returned that indicates failure. */ static __always_inline unsigned long __alloc_vmap_area(struct rb_root *root, struct list_head *head, unsigned long size, unsigned long align, unsigned long vstart, unsigned long vend) { bool adjust_search_size = true; unsigned long nva_start_addr; struct vmap_area *va; int ret; /* * Do not adjust when: * a) align <= PAGE_SIZE, because it does not make any sense. * All blocks(their start addresses) are at least PAGE_SIZE * aligned anyway; * b) a short range where a requested size corresponds to exactly * specified [vstart:vend] interval and an alignment > PAGE_SIZE. * With adjusted search length an allocation would not succeed. */ if (align <= PAGE_SIZE || (align > PAGE_SIZE && (vend - vstart) == size)) adjust_search_size = false; va = find_vmap_lowest_match(root, size, align, vstart, adjust_search_size); if (unlikely(!va)) return vend; if (va->va_start > vstart) nva_start_addr = ALIGN(va->va_start, align); else nva_start_addr = ALIGN(vstart, align); /* Check the "vend" restriction. */ if (nva_start_addr + size > vend) return vend; /* Update the free vmap_area. */ ret = adjust_va_to_fit_type(root, head, va, nva_start_addr, size); if (WARN_ON_ONCE(ret)) return vend; #if DEBUG_AUGMENT_LOWEST_MATCH_CHECK find_vmap_lowest_match_check(root, head, size, align); #endif return nva_start_addr; } /* * Free a region of KVA allocated by alloc_vmap_area */ static void free_vmap_area(struct vmap_area *va) { /* * Remove from the busy tree/list. */ spin_lock(&vmap_area_lock); unlink_va(va, &vmap_area_root); spin_unlock(&vmap_area_lock); /* * Insert/Merge it back to the free tree/list. */ spin_lock(&free_vmap_area_lock); merge_or_add_vmap_area_augment(va, &free_vmap_area_root, &free_vmap_area_list); spin_unlock(&free_vmap_area_lock); } static inline void preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node) { struct vmap_area *va = NULL; /* * Preload this CPU with one extra vmap_area object. It is used * when fit type of free area is NE_FIT_TYPE. It guarantees that * a CPU that does an allocation is preloaded. * * We do it in non-atomic context, thus it allows us to use more * permissive allocation masks to be more stable under low memory * condition and high memory pressure. */ if (!this_cpu_read(ne_fit_preload_node)) va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node); spin_lock(lock); if (va && __this_cpu_cmpxchg(ne_fit_preload_node, NULL, va)) kmem_cache_free(vmap_area_cachep, va); } /* * Allocate a region of KVA of the specified size and alignment, within the * vstart and vend. */ static struct vmap_area *alloc_vmap_area(unsigned long size, unsigned long align, unsigned long vstart, unsigned long vend, int node, gfp_t gfp_mask) { struct vmap_area *va; unsigned long freed; unsigned long addr; int purged = 0; int ret; BUG_ON(!size); BUG_ON(offset_in_page(size)); BUG_ON(!is_power_of_2(align)); if (unlikely(!vmap_initialized)) return ERR_PTR(-EBUSY); might_sleep(); gfp_mask = gfp_mask & GFP_RECLAIM_MASK; va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node); if (unlikely(!va)) return ERR_PTR(-ENOMEM); /* * Only scan the relevant parts containing pointers to other objects * to avoid false negatives. */ kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask); retry: preload_this_cpu_lock(&free_vmap_area_lock, gfp_mask, node); addr = __alloc_vmap_area(&free_vmap_area_root, &free_vmap_area_list, size, align, vstart, vend); spin_unlock(&free_vmap_area_lock); /* * If an allocation fails, the "vend" address is * returned. Therefore trigger the overflow path. */ if (unlikely(addr == vend)) goto overflow; va->va_start = addr; va->va_end = addr + size; va->vm = NULL; spin_lock(&vmap_area_lock); insert_vmap_area(va, &vmap_area_root, &vmap_area_list); spin_unlock(&vmap_area_lock); BUG_ON(!IS_ALIGNED(va->va_start, align)); BUG_ON(va->va_start < vstart); BUG_ON(va->va_end > vend); ret = kasan_populate_vmalloc(addr, size); if (ret) { free_vmap_area(va); return ERR_PTR(ret); } return va; overflow: if (!purged) { purge_vmap_area_lazy(); purged = 1; goto retry; } freed = 0; blocking_notifier_call_chain(&vmap_notify_list, 0, &freed); if (freed > 0) { purged = 0; goto retry; } if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n", size); kmem_cache_free(vmap_area_cachep, va); return ERR_PTR(-EBUSY); } int register_vmap_purge_notifier(struct notifier_block *nb) { return blocking_notifier_chain_register(&vmap_notify_list, nb); } EXPORT_SYMBOL_GPL(register_vmap_purge_notifier); int unregister_vmap_purge_notifier(struct notifier_block *nb) { return blocking_notifier_chain_unregister(&vmap_notify_list, nb); } EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier); /* * lazy_max_pages is the maximum amount of virtual address space we gather up * before attempting to purge with a TLB flush. * * There is a tradeoff here: a larger number will cover more kernel page tables * and take slightly longer to purge, but it will linearly reduce the number of * global TLB flushes that must be performed. It would seem natural to scale * this number up linearly with the number of CPUs (because vmapping activity * could also scale linearly with the number of CPUs), however it is likely * that in practice, workloads might be constrained in other ways that mean * vmap activity will not scale linearly with CPUs. Also, I want to be * conservative and not introduce a big latency on huge systems, so go with * a less aggressive log scale. It will still be an improvement over the old * code, and it will be simple to change the scale factor if we find that it * becomes a problem on bigger systems. */ static unsigned long lazy_max_pages(void) { unsigned int log; log = fls(num_online_cpus()); return log * (32UL * 1024 * 1024 / PAGE_SIZE); } static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0); /* * Serialize vmap purging. There is no actual critical section protected * by this lock, but we want to avoid concurrent calls for performance * reasons and to make the pcpu_get_vm_areas more deterministic. */ static DEFINE_MUTEX(vmap_purge_lock); /* for per-CPU blocks */ static void purge_fragmented_blocks_allcpus(void); /* * Purges all lazily-freed vmap areas. */ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) { unsigned long resched_threshold; struct list_head local_purge_list; struct vmap_area *va, *n_va; lockdep_assert_held(&vmap_purge_lock); spin_lock(&purge_vmap_area_lock); purge_vmap_area_root = RB_ROOT; list_replace_init(&purge_vmap_area_list, &local_purge_list); spin_unlock(&purge_vmap_area_lock); if (unlikely(list_empty(&local_purge_list))) return false; start = min(start, list_first_entry(&local_purge_list, struct vmap_area, list)->va_start); end = max(end, list_last_entry(&local_purge_list, struct vmap_area, list)->va_end); flush_tlb_kernel_range(start, end); resched_threshold = lazy_max_pages() << 1; spin_lock(&free_vmap_area_lock); list_for_each_entry_safe(va, n_va, &local_purge_list, list) { unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT; unsigned long orig_start = va->va_start; unsigned long orig_end = va->va_end; /* * Finally insert or merge lazily-freed area. It is * detached and there is no need to "unlink" it from * anything. */ va = merge_or_add_vmap_area_augment(va, &free_vmap_area_root, &free_vmap_area_list); if (!va) continue; if (is_vmalloc_or_module_addr((void *)orig_start)) kasan_release_vmalloc(orig_start, orig_end, va->va_start, va->va_end); atomic_long_sub(nr, &vmap_lazy_nr); if (atomic_long_read(&vmap_lazy_nr) < resched_threshold) cond_resched_lock(&free_vmap_area_lock); } spin_unlock(&free_vmap_area_lock); return true; } /* * Kick off a purge of the outstanding lazy areas. */ static void purge_vmap_area_lazy(void) { mutex_lock(&vmap_purge_lock); purge_fragmented_blocks_allcpus(); __purge_vmap_area_lazy(ULONG_MAX, 0); mutex_unlock(&vmap_purge_lock); } static void drain_vmap_area_work(struct work_struct *work) { unsigned long nr_lazy; do { mutex_lock(&vmap_purge_lock); __purge_vmap_area_lazy(ULONG_MAX, 0); mutex_unlock(&vmap_purge_lock); /* Recheck if further work is required. */ nr_lazy = atomic_long_read(&vmap_lazy_nr); } while (nr_lazy > lazy_max_pages()); } /* * Free a vmap area, caller ensuring that the area has been unmapped * and flush_cache_vunmap had been called for the correct range * previously. */ static void free_vmap_area_noflush(struct vmap_area *va) { unsigned long nr_lazy; spin_lock(&vmap_area_lock); unlink_va(va, &vmap_area_root); spin_unlock(&vmap_area_lock); nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr); /* * Merge or place it to the purge tree/list. */ spin_lock(&purge_vmap_area_lock); merge_or_add_vmap_area(va, &purge_vmap_area_root, &purge_vmap_area_list); spin_unlock(&purge_vmap_area_lock); /* After this point, we may free va at any time */ if (unlikely(nr_lazy > lazy_max_pages())) schedule_work(&drain_vmap_work); } /* * Free and unmap a vmap area */ static void free_unmap_vmap_area(struct vmap_area *va) { flush_cache_vunmap(va->va_start, va->va_end); vunmap_range_noflush(va->va_start, va->va_end); if (debug_pagealloc_enabled_static()) flush_tlb_kernel_range(va->va_start, va->va_end); free_vmap_area_noflush(va); } struct vmap_area *find_vmap_area(unsigned long addr) { struct vmap_area *va; spin_lock(&vmap_area_lock); va = __find_vmap_area(addr, &vmap_area_root); spin_unlock(&vmap_area_lock); return va; } /*** Per cpu kva allocator ***/ /* * vmap space is limited especially on 32 bit architectures. Ensure there is * room for at least 16 percpu vmap blocks per CPU. */ /* * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able * to #define VMALLOC_SPACE (VMALLOC_END-VMALLOC_START). Guess * instead (we just need a rough idea) */ #if BITS_PER_LONG == 32 #define VMALLOC_SPACE (128UL*1024*1024) #else #define VMALLOC_SPACE (128UL*1024*1024*1024) #endif #define VMALLOC_PAGES (VMALLOC_SPACE / PAGE_SIZE) #define VMAP_MAX_ALLOC BITS_PER_LONG /* 256K with 4K pages */ #define VMAP_BBMAP_BITS_MAX 1024 /* 4MB with 4K pages */ #define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2) #define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */ #define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */ #define VMAP_BBMAP_BITS \ VMAP_MIN(VMAP_BBMAP_BITS_MAX, \ VMAP_MAX(VMAP_BBMAP_BITS_MIN, \ VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16)) #define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE) struct vmap_block_queue { spinlock_t lock; struct list_head free; }; struct vmap_block { spinlock_t lock; struct vmap_area *va; unsigned long free, dirty; unsigned long dirty_min, dirty_max; /*< dirty range */ struct list_head free_list; struct rcu_head rcu_head; struct list_head purge; }; /* Queue of free and dirty vmap blocks, for allocation and flushing purposes */ static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue); /* * XArray of vmap blocks, indexed by address, to quickly find a vmap block * in the free path. Could get rid of this if we change the API to return a * "cookie" from alloc, to be passed to free. But no big deal yet. */ static DEFINE_XARRAY(vmap_blocks); /* * We should probably have a fallback mechanism to allocate virtual memory * out of partially filled vmap blocks. However vmap block sizing should be * fairly reasonable according to the vmalloc size, so it shouldn't be a * big problem. */ static unsigned long addr_to_vb_idx(unsigned long addr) { addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-1); addr /= VMAP_BLOCK_SIZE; return addr; } static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off) { unsigned long addr; addr = va_start + (pages_off << PAGE_SHIFT); BUG_ON(addr_to_vb_idx(addr) != addr_to_vb_idx(va_start)); return (void *)addr; } /** * new_vmap_block - allocates new vmap_block and occupies 2^order pages in this * block. Of course pages number can't exceed VMAP_BBMAP_BITS * @order: how many 2^order pages should be occupied in newly allocated block * @gfp_mask: flags for the page level allocator * * Return: virtual address in a newly allocated block or ERR_PTR(-errno) */ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) { struct vmap_block_queue *vbq; struct vmap_block *vb; struct vmap_area *va; unsigned long vb_idx; int node, err; void *vaddr; node = numa_node_id(); vb = kmalloc_node(sizeof(struct vmap_block), gfp_mask & GFP_RECLAIM_MASK, node); if (unlikely(!vb)) return ERR_PTR(-ENOMEM); va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE, VMALLOC_START, VMALLOC_END, node, gfp_mask); if (IS_ERR(va)) { kfree(vb); return ERR_CAST(va); } vaddr = vmap_block_vaddr(va->va_start, 0); spin_lock_init(&vb->lock); vb->va = va; /* At least something should be left free */ BUG_ON(VMAP_BBMAP_BITS <= (1UL << order)); vb->free = VMAP_BBMAP_BITS - (1UL << order); vb->dirty = 0; vb->dirty_min = VMAP_BBMAP_BITS; vb->dirty_max = 0; INIT_LIST_HEAD(&vb->free_list); vb_idx = addr_to_vb_idx(va->va_start); err = xa_insert(&vmap_blocks, vb_idx, vb, gfp_mask); if (err) { kfree(vb); free_vmap_area(va); return ERR_PTR(err); } vbq = raw_cpu_ptr(&vmap_block_queue); spin_lock(&vbq->lock); list_add_tail_rcu(&vb->free_list, &vbq->free); spin_unlock(&vbq->lock); return vaddr; } static void free_vmap_block(struct vmap_block *vb) { struct vmap_block *tmp; tmp = xa_erase(&vmap_blocks, addr_to_vb_idx(vb->va->va_start)); BUG_ON(tmp != vb); free_vmap_area_noflush(vb->va); kfree_rcu(vb, rcu_head); } static void purge_fragmented_blocks(int cpu) { LIST_HEAD(purge); struct vmap_block *vb; struct vmap_block *n_vb; struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu); rcu_read_lock(); list_for_each_entry_rcu(vb, &vbq->free, free_list) { if (!(vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS)) continue; spin_lock(&vb->lock); if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) { vb->free = 0; /* prevent further allocs after releasing lock */ vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */ vb->dirty_min = 0; vb->dirty_max = VMAP_BBMAP_BITS; spin_lock(&vbq->lock); list_del_rcu(&vb->free_list); spin_unlock(&vbq->lock); spin_unlock(&vb->lock); list_add_tail(&vb->purge, &purge); } else spin_unlock(&vb->lock); } rcu_read_unlock(); list_for_each_entry_safe(vb, n_vb, &purge, purge) { list_del(&vb->purge); free_vmap_block(vb); } } static void purge_fragmented_blocks_allcpus(void) { int cpu; for_each_possible_cpu(cpu) purge_fragmented_blocks(cpu); } static void *vb_alloc(unsigned long size, gfp_t gfp_mask) { struct vmap_block_queue *vbq; struct vmap_block *vb; void *vaddr = NULL; unsigned int order; BUG_ON(offset_in_page(size)); BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); if (WARN_ON(size == 0)) { /* * Allocating 0 bytes isn't what caller wants since * get_order(0) returns funny result. Just warn and terminate * early. */ return NULL; } order = get_order(size); rcu_read_lock(); vbq = raw_cpu_ptr(&vmap_block_queue); list_for_each_entry_rcu(vb, &vbq->free, free_list) { unsigned long pages_off; spin_lock(&vb->lock); if (vb->free < (1UL << order)) { spin_unlock(&vb->lock); continue; } pages_off = VMAP_BBMAP_BITS - vb->free; vaddr = vmap_block_vaddr(vb->va->va_start, pages_off); vb->free -= 1UL << order; if (vb->free == 0) { spin_lock(&vbq->lock); list_del_rcu(&vb->free_list); spin_unlock(&vbq->lock); } spin_unlock(&vb->lock); break; } rcu_read_unlock(); /* Allocate new block if nothing was found */ if (!vaddr) vaddr = new_vmap_block(order, gfp_mask); return vaddr; } static void vb_free(unsigned long addr, unsigned long size) { unsigned long offset; unsigned int order; struct vmap_block *vb; BUG_ON(offset_in_page(size)); BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); flush_cache_vunmap(addr, addr + size); order = get_order(size); offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT; vb = xa_load(&vmap_blocks, addr_to_vb_idx(addr)); vunmap_range_noflush(addr, addr + size); if (debug_pagealloc_enabled_static()) flush_tlb_kernel_range(addr, addr + size); spin_lock(&vb->lock); /* Expand dirty range */ vb->dirty_min = min(vb->dirty_min, offset); vb->dirty_max = max(vb->dirty_max, offset + (1UL << order)); vb->dirty += 1UL << order; if (vb->dirty == VMAP_BBMAP_BITS) { BUG_ON(vb->free); spin_unlock(&vb->lock); free_vmap_block(vb); } else spin_unlock(&vb->lock); } static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush) { int cpu; if (unlikely(!vmap_initialized)) return; might_sleep(); for_each_possible_cpu(cpu) { struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu); struct vmap_block *vb; rcu_read_lock(); list_for_each_entry_rcu(vb, &vbq->free, free_list) { spin_lock(&vb->lock); if (vb->dirty && vb->dirty != VMAP_BBMAP_BITS) { unsigned long va_start = vb->va->va_start; unsigned long s, e; s = va_start + (vb->dirty_min << PAGE_SHIFT); e = va_start + (vb->dirty_max << PAGE_SHIFT); start = min(s, start); end = max(e, end); flush = 1; } spin_unlock(&vb->lock); } rcu_read_unlock(); } mutex_lock(&vmap_purge_lock); purge_fragmented_blocks_allcpus(); if (!__purge_vmap_area_lazy(start, end) && flush) flush_tlb_kernel_range(start, end); mutex_unlock(&vmap_purge_lock); } /** * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer * * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily * to amortize TLB flushing overheads. What this means is that any page you * have now, may, in a former life, have been mapped into kernel virtual * address by the vmap layer and so there might be some CPUs with TLB entries * still referencing that page (additional to the regular 1:1 kernel mapping). * * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can * be sure that none of the pages we have control over will have any aliases * from the vmap layer. */ void vm_unmap_aliases(void) { unsigned long start = ULONG_MAX, end = 0; int flush = 0; _vm_unmap_aliases(start, end, flush); } EXPORT_SYMBOL_GPL(vm_unmap_aliases); /** * vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram * @mem: the pointer returned by vm_map_ram * @count: the count passed to that vm_map_ram call (cannot unmap partial) */ void vm_unmap_ram(const void *mem, unsigned int count) { unsigned long size = (unsigned long)count << PAGE_SHIFT; unsigned long addr = (unsigned long)kasan_reset_tag(mem); struct vmap_area *va; might_sleep(); BUG_ON(!addr); BUG_ON(addr < VMALLOC_START); BUG_ON(addr > VMALLOC_END); BUG_ON(!PAGE_ALIGNED(addr)); kasan_poison_vmalloc(mem, size); if (likely(count <= VMAP_MAX_ALLOC)) { debug_check_no_locks_freed(mem, size); vb_free(addr, size); return; } va = find_vmap_area(addr); BUG_ON(!va); debug_check_no_locks_freed((void *)va->va_start, (va->va_end - va->va_start)); free_unmap_vmap_area(va); } EXPORT_SYMBOL(vm_unmap_ram); /** * vm_map_ram - map pages linearly into kernel virtual address (vmalloc space) * @pages: an array of pointers to the pages to be mapped * @count: number of pages * @node: prefer to allocate data structures on this node * * If you use this function for less than VMAP_MAX_ALLOC pages, it could be * faster than vmap so it's good. But if you mix long-life and short-life * objects with vm_map_ram(), it could consume lots of address space through * fragmentation (especially on a 32bit machine). You could see failures in * the end. Please use this function for short-lived objects. * * Returns: a pointer to the address that has been mapped, or %NULL on failure */ void *vm_map_ram(struct page **pages, unsigned int count, int node) { unsigned long size = (unsigned long)count << PAGE_SHIFT; unsigned long addr; void *mem; if (likely(count <= VMAP_MAX_ALLOC)) { mem = vb_alloc(size, GFP_KERNEL); if (IS_ERR(mem)) return NULL; addr = (unsigned long)mem; } else { struct vmap_area *va; va = alloc_vmap_area(size, PAGE_SIZE, VMALLOC_START, VMALLOC_END, node, GFP_KERNEL); if (IS_ERR(va)) return NULL; addr = va->va_start; mem = (void *)addr; } if (vmap_pages_range(addr, addr + size, PAGE_KERNEL, pages, PAGE_SHIFT) < 0) { vm_unmap_ram(mem, count); return NULL; } /* * Mark the pages as accessible, now that they are mapped. * With hardware tag-based KASAN, marking is skipped for * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc(). */ mem = kasan_unpoison_vmalloc(mem, size, KASAN_VMALLOC_PROT_NORMAL); return mem; } EXPORT_SYMBOL(vm_map_ram); static struct vm_struct *vmlist __initdata; static inline unsigned int vm_area_page_order(struct vm_struct *vm) { #ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC return vm->page_order; #else return 0; #endif } static inline void set_vm_area_page_order(struct vm_struct *vm, unsigned int order) { #ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC vm->page_order = order; #else BUG_ON(order != 0); #endif } /** * vm_area_add_early - add vmap area early during boot * @vm: vm_struct to add * * This function is used to add fixed kernel vm area to vmlist before * vmalloc_init() is called. @vm->addr, @vm->size, and @vm->flags * should contain proper values and the other fields should be zero. * * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING. */ void __init vm_area_add_early(struct vm_struct *vm) { struct vm_struct *tmp, **p; BUG_ON(vmap_initialized); for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { if (tmp->addr >= vm->addr) { BUG_ON(tmp->addr < vm->addr + vm->size); break; } else BUG_ON(tmp->addr + tmp->size > vm->addr); } vm->next = *p; *p = vm; } /** * vm_area_register_early - register vmap area early during boot * @vm: vm_struct to register * @align: requested alignment * * This function is used to register kernel vm area before * vmalloc_init() is called. @vm->size and @vm->flags should contain * proper values on entry and other fields should be zero. On return, * vm->addr contains the allocated address. * * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING. */ void __init vm_area_register_early(struct vm_struct *vm, size_t align) { unsigned long addr = ALIGN(VMALLOC_START, align); struct vm_struct *cur, **p; BUG_ON(vmap_initialized); for (p = &vmlist; (cur = *p) != NULL; p = &cur->next) { if ((unsigned long)cur->addr - addr >= vm->size) break; addr = ALIGN((unsigned long)cur->addr + cur->size, align); } BUG_ON(addr > VMALLOC_END - vm->size); vm->addr = (void *)addr; vm->next = *p; *p = vm; kasan_populate_early_vm_area_shadow(vm->addr, vm->size); } static void vmap_init_free_space(void) { unsigned long vmap_start = 1; const unsigned long vmap_end = ULONG_MAX; struct vmap_area *busy, *free; /* * B F B B B F * -|-----|.....|-----|-----|-----|.....|- * | The KVA space | * |<--------------------------------->| */ list_for_each_entry(busy, &vmap_area_list, list) { if (busy->va_start - vmap_start > 0) { free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); if (!WARN_ON_ONCE(!free)) { free->va_start = vmap_start; free->va_end = busy->va_start; insert_vmap_area_augment(free, NULL, &free_vmap_area_root, &free_vmap_area_list); } } vmap_start = busy->va_end; } if (vmap_end - vmap_start > 0) { free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); if (!WARN_ON_ONCE(!free)) { free->va_start = vmap_start; free->va_end = vmap_end; insert_vmap_area_augment(free, NULL, &free_vmap_area_root, &free_vmap_area_list); } } } void __init vmalloc_init(void) { struct vmap_area *va; struct vm_struct *tmp; int i; /* * Create the cache for vmap_area objects. */ vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC); for_each_possible_cpu(i) { struct vmap_block_queue *vbq; struct vfree_deferred *p; vbq = &per_cpu(vmap_block_queue, i); spin_lock_init(&vbq->lock); INIT_LIST_HEAD(&vbq->free); p = &per_cpu(vfree_deferred, i); init_llist_head(&p->list); INIT_WORK(&p->wq, free_work); } /* Import existing vmlist entries. */ for (tmp = vmlist; tmp; tmp = tmp->next) { va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); if (WARN_ON_ONCE(!va)) continue; va->va_start = (unsigned long)tmp->addr; va->va_end = va->va_start + tmp->size; va->vm = tmp; insert_vmap_area(va, &vmap_area_root, &vmap_area_list); } /* * Now we can initialize a free vmap space. */ vmap_init_free_space(); vmap_initialized = true; } static inline void setup_vmalloc_vm_locked(struct vm_struct *vm, struct vmap_area *va, unsigned long flags, const void *caller) { vm->flags = flags; vm->addr = (void *)va->va_start; vm->size = va->va_end - va->va_start; vm->caller = caller; va->vm = vm; } static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, unsigned long flags, const void *caller) { spin_lock(&vmap_area_lock); setup_vmalloc_vm_locked(vm, va, flags, caller); spin_unlock(&vmap_area_lock); } static void clear_vm_uninitialized_flag(struct vm_struct *vm) { /* * Before removing VM_UNINITIALIZED, * we should make sure that vm has proper values. * Pair with smp_rmb() in show_numa_info(). */ smp_wmb(); vm->flags &= ~VM_UNINITIALIZED; } static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long align, unsigned long shift, unsigned long flags, unsigned long start, unsigned long end, int node, gfp_t gfp_mask, const void *caller) { struct vmap_area *va; struct vm_struct *area; unsigned long requested_size = size; BUG_ON(in_interrupt()); size = ALIGN(size, 1ul << shift); if (unlikely(!size)) return NULL; if (flags & VM_IOREMAP) align = 1ul << clamp_t(int, get_count_order_long(size), PAGE_SHIFT, IOREMAP_MAX_ORDER); area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); if (unlikely(!area)) return NULL; if (!(flags & VM_NO_GUARD)) size += PAGE_SIZE; va = alloc_vmap_area(size, align, start, end, node, gfp_mask); if (IS_ERR(va)) { kfree(area); return NULL; } setup_vmalloc_vm(area, va, flags, caller); /* * Mark pages for non-VM_ALLOC mappings as accessible. Do it now as a * best-effort approach, as they can be mapped outside of vmalloc code. * For VM_ALLOC mappings, the pages are marked as accessible after * getting mapped in __vmalloc_node_range(). * With hardware tag-based KASAN, marking is skipped for * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc(). */ if (!(flags & VM_ALLOC)) area->addr = kasan_unpoison_vmalloc(area->addr, requested_size, KASAN_VMALLOC_PROT_NORMAL); return area; } struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, unsigned long start, unsigned long end, const void *caller) { return __get_vm_area_node(size, 1, PAGE_SHIFT, flags, start, end, NUMA_NO_NODE, GFP_KERNEL, caller); } /** * get_vm_area - reserve a contiguous kernel virtual area * @size: size of the area * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC * * Search an area of @size in the kernel virtual mapping area, * and reserved it for out purposes. Returns the area descriptor * on success or %NULL on failure. * * Return: the area descriptor on success or %NULL on failure. */ struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) { return __get_vm_area_node(size, 1, PAGE_SHIFT, flags, VMALLOC_START, VMALLOC_END, NUMA_NO_NODE, GFP_KERNEL, __builtin_return_address(0)); } struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, const void *caller) { return __get_vm_area_node(size, 1, PAGE_SHIFT, flags, VMALLOC_START, VMALLOC_END, NUMA_NO_NODE, GFP_KERNEL, caller); } /** * find_vm_area - find a continuous kernel virtual area * @addr: base address * * Search for the kernel VM area starting at @addr, and return it. * It is up to the caller to do all required locking to keep the returned * pointer valid. * * Return: the area descriptor on success or %NULL on failure. */ struct vm_struct *find_vm_area(const void *addr) { struct vmap_area *va; va = find_vmap_area((unsigned long)addr); if (!va) return NULL; return va->vm; } /** * remove_vm_area - find and remove a continuous kernel virtual area * @addr: base address * * Search for the kernel VM area starting at @addr, and remove it. * This function returns the found VM area, but using it is NOT safe * on SMP machines, except for its size or flags. * * Return: the area descriptor on success or %NULL on failure. */ struct vm_struct *remove_vm_area(const void *addr) { struct vmap_area *va; might_sleep(); spin_lock(&vmap_area_lock); va = __find_vmap_area((unsigned long)addr, &vmap_area_root); if (va && va->vm) { struct vm_struct *vm = va->vm; va->vm = NULL; spin_unlock(&vmap_area_lock); kasan_free_module_shadow(vm); free_unmap_vmap_area(va); return vm; } spin_unlock(&vmap_area_lock); return NULL; } static inline void set_area_direct_map(const struct vm_struct *area, int (*set_direct_map)(struct page *page)) { int i; /* HUGE_VMALLOC passes small pages to set_direct_map */ for (i = 0; i < area->nr_pages; i++) if (page_address(area->pages[i])) set_direct_map(area->pages[i]); } /* Handle removing and resetting vm mappings related to the vm_struct. */ static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages) { unsigned long start = ULONG_MAX, end = 0; unsigned int page_order = vm_area_page_order(area); int flush_reset = area->flags & VM_FLUSH_RESET_PERMS; int flush_dmap = 0; int i; remove_vm_area(area->addr); /* If this is not VM_FLUSH_RESET_PERMS memory, no need for the below. */ if (!flush_reset) return; /* * If not deallocating pages, just do the flush of the VM area and * return. */ if (!deallocate_pages) { vm_unmap_aliases(); return; } /* * If execution gets here, flush the vm mapping and reset the direct * map. Find the start and end range of the direct mappings to make sure * the vm_unmap_aliases() flush includes the direct map. */ for (i = 0; i < area->nr_pages; i += 1U << page_order) { unsigned long addr = (unsigned long)page_address(area->pages[i]); if (addr) { unsigned long page_size; page_size = PAGE_SIZE << page_order; start = min(addr, start); end = max(addr + page_size, end); flush_dmap = 1; } } /* * Set direct map to something invalid so that it won't be cached if * there are any accesses after the TLB flush, then flush the TLB and * reset the direct map permissions to the default. */ set_area_direct_map(area, set_direct_map_invalid_noflush); _vm_unmap_aliases(start, end, flush_dmap); set_area_direct_map(area, set_direct_map_default_noflush); } static void __vunmap(const void *addr, int deallocate_pages) { struct vm_struct *area; if (!addr) return; if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n", addr)) return; area = find_vm_area(addr); if (unlikely(!area)) { WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", addr); return; } debug_check_no_locks_freed(area->addr, get_vm_area_size(area)); debug_check_no_obj_freed(area->addr, get_vm_area_size(area)); kasan_poison_vmalloc(area->addr, get_vm_area_size(area)); if (IS_ENABLED(CONFIG_ARCH_HAS_IOREMAP_PHYS_HOOKS) && area->flags & VM_IOREMAP) iounmap_phys_range_hook(area->phys_addr, get_vm_area_size(area)); vm_remove_mappings(area, deallocate_pages); if (deallocate_pages) { int i; for (i = 0; i < area->nr_pages; i++) { struct page *page = area->pages[i]; BUG_ON(!page); if (!(area->flags & VM_MAP_PUT_PAGES)) mod_memcg_page_state(page, MEMCG_VMALLOC, -1); /* * High-order allocs for huge vmallocs are split, so * can be freed as an array of order-0 allocations */ __free_pages(page, 0); cond_resched(); } if (!(area->flags & VM_MAP_PUT_PAGES)) atomic_long_sub(area->nr_pages, &nr_vmalloc_pages); kvfree(area->pages); } kfree(area); } static inline void __vfree_deferred(const void *addr) { /* * Use raw_cpu_ptr() because this can be called from preemptible * context. Preemption is absolutely fine here, because the llist_add() * implementation is lockless, so it works even if we are adding to * another cpu's list. schedule_work() should be fine with this too. */ struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred); if (llist_add((struct llist_node *)addr, &p->list)) schedule_work(&p->wq); } /** * vfree_atomic - release memory allocated by vmalloc() * @addr: memory base address * * This one is just like vfree() but can be called in any atomic context * except NMIs. */ void vfree_atomic(const void *addr) { BUG_ON(in_nmi()); kmemleak_free(addr); if (!addr) return; __vfree_deferred(addr); } static void __vfree(const void *addr) { if (unlikely(in_interrupt())) __vfree_deferred(addr); else __vunmap(addr, 1); } /** * vfree - Release memory allocated by vmalloc() * @addr: Memory base address * * Free the virtually continuous memory area starting at @addr, as obtained * from one of the vmalloc() family of APIs. This will usually also free the * physical memory underlying the virtual allocation, but that memory is * reference counted, so it will not be freed until the last user goes away. * * If @addr is NULL, no operation is performed. * * Context: * May sleep if called *not* from interrupt context. * Must not be called in NMI context (strictly speaking, it could be * if we have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling * conventions for vfree() arch-dependent would be a really bad idea). */ void vfree(const void *addr) { BUG_ON(in_nmi()); kmemleak_free(addr); might_sleep_if(!in_interrupt()); if (!addr) return; __vfree(addr); } EXPORT_SYMBOL(vfree); /** * vunmap - release virtual mapping obtained by vmap() * @addr: memory base address * * Free the virtually contiguous memory area starting at @addr, * which was created from the page array passed to vmap(). * * Must not be called in interrupt context. */ void vunmap(const void *addr) { BUG_ON(in_interrupt()); might_sleep(); if (addr) __vunmap(addr, 0); } EXPORT_SYMBOL(vunmap); /** * vmap - map an array of pages into virtually contiguous space * @pages: array of page pointers * @count: number of pages to map * @flags: vm_area->flags * @prot: page protection for the mapping * * Maps @count pages from @pages into contiguous kernel virtual space. * If @flags contains %VM_MAP_PUT_PAGES the ownership of the pages array itself * (which must be kmalloc or vmalloc memory) and one reference per pages in it * are transferred from the caller to vmap(), and will be freed / dropped when * vfree() is called on the return value. * * Return: the address of the area or %NULL on failure */ void *vmap(struct page **pages, unsigned int count, unsigned long flags, pgprot_t prot) { struct vm_struct *area; unsigned long addr; unsigned long size; /* In bytes */ might_sleep(); /* * Your top guard is someone else's bottom guard. Not having a top * guard compromises someone else's mappings too. */ if (WARN_ON_ONCE(flags & VM_NO_GUARD)) flags &= ~VM_NO_GUARD; if (count > totalram_pages()) return NULL; size = (unsigned long)count << PAGE_SHIFT; area = get_vm_area_caller(size, flags, __builtin_return_address(0)); if (!area) return NULL; addr = (unsigned long)area->addr; if (vmap_pages_range(addr, addr + size, pgprot_nx(prot), pages, PAGE_SHIFT) < 0) { vunmap(area->addr); return NULL; } if (flags & VM_MAP_PUT_PAGES) { area->pages = pages; area->nr_pages = count; } return area->addr; } EXPORT_SYMBOL(vmap); #ifdef CONFIG_VMAP_PFN struct vmap_pfn_data { unsigned long *pfns; pgprot_t prot; unsigned int idx; }; static int vmap_pfn_apply(pte_t *pte, unsigned long addr, void *private) { struct vmap_pfn_data *data = private; if (WARN_ON_ONCE(pfn_valid(data->pfns[data->idx]))) return -EINVAL; *pte = pte_mkspecial(pfn_pte(data->pfns[data->idx++], data->prot)); return 0; } /** * vmap_pfn - map an array of PFNs into virtually contiguous space * @pfns: array of PFNs * @count: number of pages to map * @prot: page protection for the mapping * * Maps @count PFNs from @pfns into contiguous kernel virtual space and returns * the start address of the mapping. */ void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot) { struct vmap_pfn_data data = { .pfns = pfns, .prot = pgprot_nx(prot) }; struct vm_struct *area; area = get_vm_area_caller(count * PAGE_SIZE, VM_IOREMAP, __builtin_return_address(0)); if (!area) return NULL; if (apply_to_page_range(&init_mm, (unsigned long)area->addr, count * PAGE_SIZE, vmap_pfn_apply, &data)) { free_vm_area(area); return NULL; } flush_cache_vmap((unsigned long)area->addr, (unsigned long)area->addr + count * PAGE_SIZE); return area->addr; } EXPORT_SYMBOL_GPL(vmap_pfn); #endif /* CONFIG_VMAP_PFN */ static inline unsigned int vm_area_alloc_pages(gfp_t gfp, int nid, unsigned int order, unsigned int nr_pages, struct page **pages) { unsigned int nr_allocated = 0; gfp_t alloc_gfp = gfp; bool nofail = gfp & __GFP_NOFAIL; struct page *page; int i; /* * For order-0 pages we make use of bulk allocator, if * the page array is partly or not at all populated due * to fails, fallback to a single page allocator that is * more permissive. */ if (!order) { /* bulk allocator doesn't support nofail req. officially */ gfp_t bulk_gfp = gfp & ~__GFP_NOFAIL; while (nr_allocated < nr_pages) { unsigned int nr, nr_pages_request; /* * A maximum allowed request is hard-coded and is 100 * pages per call. That is done in order to prevent a * long preemption off scenario in the bulk-allocator * so the range is [1:100]. */ nr_pages_request = min(100U, nr_pages - nr_allocated); /* memory allocation should consider mempolicy, we can't * wrongly use nearest node when nid == NUMA_NO_NODE, * otherwise memory may be allocated in only one node, * but mempolicy wants to alloc memory by interleaving. */ if (IS_ENABLED(CONFIG_NUMA) && nid == NUMA_NO_NODE) nr = alloc_pages_bulk_array_mempolicy(bulk_gfp, nr_pages_request, pages + nr_allocated); else nr = alloc_pages_bulk_array_node(bulk_gfp, nid, nr_pages_request, pages + nr_allocated); nr_allocated += nr; cond_resched(); /* * If zero or pages were obtained partly, * fallback to a single page allocator. */ if (nr != nr_pages_request) break; } } else if (gfp & __GFP_NOFAIL) { /* * Higher order nofail allocations are really expensive and * potentially dangerous (pre-mature OOM, disruptive reclaim * and compaction etc. */ alloc_gfp &= ~__GFP_NOFAIL; } /* High-order pages or fallback path if "bulk" fails. */ while (nr_allocated < nr_pages) { if (!nofail && fatal_signal_pending(current)) break; if (nid == NUMA_NO_NODE) page = alloc_pages(alloc_gfp, order); else page = alloc_pages_node(nid, alloc_gfp, order); if (unlikely(!page)) break; /* * Higher order allocations must be able to be treated as * indepdenent small pages by callers (as they can with * small-page vmallocs). Some drivers do their own refcounting * on vmalloc_to_page() pages, some use page->mapping, * page->lru, etc. */ if (order) split_page(page, order); /* * Careful, we allocate and map page-order pages, but * tracking is done per PAGE_SIZE page so as to keep the * vm_struct APIs independent of the physical/mapped size. */ for (i = 0; i < (1U << order); i++) pages[nr_allocated + i] = page + i; cond_resched(); nr_allocated += 1U << order; } return nr_allocated; } static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot, unsigned int page_shift, int node) { const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; bool nofail = gfp_mask & __GFP_NOFAIL; unsigned long addr = (unsigned long)area->addr; unsigned long size = get_vm_area_size(area); unsigned long array_size; unsigned int nr_small_pages = size >> PAGE_SHIFT; unsigned int page_order; unsigned int flags; int ret; array_size = (unsigned long)nr_small_pages * sizeof(struct page *); gfp_mask |= __GFP_NOWARN; if (!(gfp_mask & (GFP_DMA | GFP_DMA32))) gfp_mask |= __GFP_HIGHMEM; /* Please note that the recursion is strictly bounded. */ if (array_size > PAGE_SIZE) { area->pages = __vmalloc_node(array_size, 1, nested_gfp, node, area->caller); } else { area->pages = kmalloc_node(array_size, nested_gfp, node); } if (!area->pages) { warn_alloc(gfp_mask, NULL, "vmalloc error: size %lu, failed to allocated page array size %lu", nr_small_pages * PAGE_SIZE, array_size); free_vm_area(area); return NULL; } set_vm_area_page_order(area, page_shift - PAGE_SHIFT); page_order = vm_area_page_order(area); area->nr_pages = vm_area_alloc_pages(gfp_mask | __GFP_NOWARN, node, page_order, nr_small_pages, area->pages); atomic_long_add(area->nr_pages, &nr_vmalloc_pages); if (gfp_mask & __GFP_ACCOUNT) { int i; for (i = 0; i < area->nr_pages; i++) mod_memcg_page_state(area->pages[i], MEMCG_VMALLOC, 1); } /* * If not enough pages were obtained to accomplish an * allocation request, free them via __vfree() if any. */ if (area->nr_pages != nr_small_pages) { /* vm_area_alloc_pages() can also fail due to a fatal signal */ if (!fatal_signal_pending(current)) warn_alloc(gfp_mask, NULL, "vmalloc error: size %lu, page order %u, failed to allocate pages", area->nr_pages * PAGE_SIZE, page_order); goto fail; } /* * page tables allocations ignore external gfp mask, enforce it * by the scope API */ if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) flags = memalloc_nofs_save(); else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) flags = memalloc_noio_save(); do { ret = vmap_pages_range(addr, addr + size, prot, area->pages, page_shift); if (nofail && (ret < 0)) schedule_timeout_uninterruptible(1); } while (nofail && (ret < 0)); if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) memalloc_nofs_restore(flags); else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) memalloc_noio_restore(flags); if (ret < 0) { warn_alloc(gfp_mask, NULL, "vmalloc error: size %lu, failed to map pages", area->nr_pages * PAGE_SIZE); goto fail; } return area->addr; fail: __vfree(area->addr); return NULL; } /** * __vmalloc_node_range - allocate virtually contiguous memory * @size: allocation size * @align: desired alignment * @start: vm area range start * @end: vm area range end * @gfp_mask: flags for the page level allocator * @prot: protection mask for the allocated pages * @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD) * @node: node to use for allocation or NUMA_NO_NODE * @caller: caller's return address * * Allocate enough pages to cover @size from the page level * allocator with @gfp_mask flags. Please note that the full set of gfp * flags are not supported. GFP_KERNEL, GFP_NOFS and GFP_NOIO are all * supported. * Zone modifiers are not supported. From the reclaim modifiers * __GFP_DIRECT_RECLAIM is required (aka GFP_NOWAIT is not supported) * and only __GFP_NOFAIL is supported (i.e. __GFP_NORETRY and * __GFP_RETRY_MAYFAIL are not supported). * * __GFP_NOWARN can be used to suppress failures messages. * * Map them into contiguous kernel virtual space, using a pagetable * protection of @prot. * * Return: the address of the area or %NULL on failure */ void *__vmalloc_node_range(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, pgprot_t prot, unsigned long vm_flags, int node, const void *caller) { struct vm_struct *area; void *ret; kasan_vmalloc_flags_t kasan_flags = KASAN_VMALLOC_NONE; unsigned long real_size = size; unsigned long real_align = align; unsigned int shift = PAGE_SHIFT; if (WARN_ON_ONCE(!size)) return NULL; if ((size >> PAGE_SHIFT) > totalram_pages()) { warn_alloc(gfp_mask, NULL, "vmalloc error: size %lu, exceeds total pages", real_size); return NULL; } if (vmap_allow_huge && (vm_flags & VM_ALLOW_HUGE_VMAP)) { unsigned long size_per_node; /* * Try huge pages. Only try for PAGE_KERNEL allocations, * others like modules don't yet expect huge pages in * their allocations due to apply_to_page_range not * supporting them. */ size_per_node = size; if (node == NUMA_NO_NODE) size_per_node /= num_online_nodes(); if (arch_vmap_pmd_supported(prot) && size_per_node >= PMD_SIZE) shift = PMD_SHIFT; else shift = arch_vmap_pte_supported_shift(size_per_node); align = max(real_align, 1UL << shift); size = ALIGN(real_size, 1UL << shift); } again: area = __get_vm_area_node(real_size, align, shift, VM_ALLOC | VM_UNINITIALIZED | vm_flags, start, end, node, gfp_mask, caller); if (!area) { bool nofail = gfp_mask & __GFP_NOFAIL; warn_alloc(gfp_mask, NULL, "vmalloc error: size %lu, vm_struct allocation failed%s", real_size, (nofail) ? ". Retrying." : ""); if (nofail) { schedule_timeout_uninterruptible(1); goto again; } goto fail; } /* * Prepare arguments for __vmalloc_area_node() and * kasan_unpoison_vmalloc(). */ if (pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) { if (kasan_hw_tags_enabled()) { /* * Modify protection bits to allow tagging. * This must be done before mapping. */ prot = arch_vmap_pgprot_tagged(prot); /* * Skip page_alloc poisoning and zeroing for physical * pages backing VM_ALLOC mapping. Memory is instead * poisoned and zeroed by kasan_unpoison_vmalloc(). */ gfp_mask |= __GFP_SKIP_KASAN_UNPOISON | __GFP_SKIP_ZERO; } /* Take note that the mapping is PAGE_KERNEL. */ kasan_flags |= KASAN_VMALLOC_PROT_NORMAL; } /* Allocate physical pages and map them into vmalloc space. */ ret = __vmalloc_area_node(area, gfp_mask, prot, shift, node); if (!ret) goto fail; /* * Mark the pages as accessible, now that they are mapped. * The condition for setting KASAN_VMALLOC_INIT should complement the * one in post_alloc_hook() with regards to the __GFP_SKIP_ZERO check * to make sure that memory is initialized under the same conditions. * Tag-based KASAN modes only assign tags to normal non-executable * allocations, see __kasan_unpoison_vmalloc(). */ kasan_flags |= KASAN_VMALLOC_VM_ALLOC; if (!want_init_on_free() && want_init_on_alloc(gfp_mask) && (gfp_mask & __GFP_SKIP_ZERO)) kasan_flags |= KASAN_VMALLOC_INIT; /* KASAN_VMALLOC_PROT_NORMAL already set if required. */ area->addr = kasan_unpoison_vmalloc(area->addr, real_size, kasan_flags); /* * In this function, newly allocated vm_struct has VM_UNINITIALIZED * flag. It means that vm_struct is not fully initialized. * Now, it is fully initialized, so remove this flag here. */ clear_vm_uninitialized_flag(area); size = PAGE_ALIGN(size); if (!(vm_flags & VM_DEFER_KMEMLEAK)) kmemleak_vmalloc(area, size, gfp_mask); return area->addr; fail: if (shift > PAGE_SHIFT) { shift = PAGE_SHIFT; align = real_align; size = real_size; goto again; } return NULL; } /** * __vmalloc_node - allocate virtually contiguous memory * @size: allocation size * @align: desired alignment * @gfp_mask: flags for the page level allocator * @node: node to use for allocation or NUMA_NO_NODE * @caller: caller's return address * * Allocate enough pages to cover @size from the page level allocator with * @gfp_mask flags. Map them into contiguous kernel virtual space. * * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL * and __GFP_NOFAIL are not supported * * Any use of gfp flags outside of GFP_KERNEL should be consulted * with mm people. * * Return: pointer to the allocated memory or %NULL on error */ void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, int node, const void *caller) { return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, gfp_mask, PAGE_KERNEL, 0, node, caller); } /* * This is only for performance analysis of vmalloc and stress purpose. * It is required by vmalloc test module, therefore do not use it other * than that. */ #ifdef CONFIG_TEST_VMALLOC_MODULE EXPORT_SYMBOL_GPL(__vmalloc_node); #endif void *__vmalloc(unsigned long size, gfp_t gfp_mask) { return __vmalloc_node(size, 1, gfp_mask, NUMA_NO_NODE, __builtin_return_address(0)); } EXPORT_SYMBOL(__vmalloc); /** * vmalloc - allocate virtually contiguous memory * @size: allocation size * * Allocate enough pages to cover @size from the page level * allocator and map them into contiguous kernel virtual space. * * For tight control over page level allocator and protection flags * use __vmalloc() instead. * * Return: pointer to the allocated memory or %NULL on error */ void *vmalloc(unsigned long size) { return __vmalloc_node(size, 1, GFP_KERNEL, NUMA_NO_NODE, __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc); /** * vmalloc_huge - allocate virtually contiguous memory, allow huge pages * @size: allocation size * @gfp_mask: flags for the page level allocator * * Allocate enough pages to cover @size from the page level * allocator and map them into contiguous kernel virtual space. * If @size is greater than or equal to PMD_SIZE, allow using * huge pages for the memory * * Return: pointer to the allocated memory or %NULL on error */ void *vmalloc_huge(unsigned long size, gfp_t gfp_mask) { return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP, NUMA_NO_NODE, __builtin_return_address(0)); } EXPORT_SYMBOL_GPL(vmalloc_huge); /** * vzalloc - allocate virtually contiguous memory with zero fill * @size: allocation size * * Allocate enough pages to cover @size from the page level * allocator and map them into contiguous kernel virtual space. * The memory allocated is set to zero. * * For tight control over page level allocator and protection flags * use __vmalloc() instead. * * Return: pointer to the allocated memory or %NULL on error */ void *vzalloc(unsigned long size) { return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, __builtin_return_address(0)); } EXPORT_SYMBOL(vzalloc); /** * vmalloc_user - allocate zeroed virtually contiguous memory for userspace * @size: allocation size * * The resulting memory area is zeroed so it can be mapped to userspace * without leaking data. * * Return: pointer to the allocated memory or %NULL on error */ void *vmalloc_user(unsigned long size) { return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL, VM_USERMAP, NUMA_NO_NODE, __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_user); /** * vmalloc_node - allocate memory on a specific node * @size: allocation size * @node: numa node * * Allocate enough pages to cover @size from the page level * allocator and map them into contiguous kernel virtual space. * * For tight control over page level allocator and protection flags * use __vmalloc() instead. * * Return: pointer to the allocated memory or %NULL on error */ void *vmalloc_node(unsigned long size, int node) { return __vmalloc_node(size, 1, GFP_KERNEL, node, __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_node); /** * vzalloc_node - allocate memory on a specific node with zero fill * @size: allocation size * @node: numa node * * Allocate enough pages to cover @size from the page level * allocator and map them into contiguous kernel virtual space. * The memory allocated is set to zero. * * Return: pointer to the allocated memory or %NULL on error */ void *vzalloc_node(unsigned long size, int node) { return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, node, __builtin_return_address(0)); } EXPORT_SYMBOL(vzalloc_node); #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL) #elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA) #define GFP_VMALLOC32 (GFP_DMA | GFP_KERNEL) #else /* * 64b systems should always have either DMA or DMA32 zones. For others * GFP_DMA32 should do the right thing and use the normal zone. */ #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL) #endif /** * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) * @size: allocation size * * Allocate enough 32bit PA addressable pages to cover @size from the * page level allocator and map them into contiguous kernel virtual space. * * Return: pointer to the allocated memory or %NULL on error */ void *vmalloc_32(unsigned long size) { return __vmalloc_node(size, 1, GFP_VMALLOC32, NUMA_NO_NODE, __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_32); /** * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory * @size: allocation size * * The resulting memory area is 32bit addressable and zeroed so it can be * mapped to userspace without leaking data. * * Return: pointer to the allocated memory or %NULL on error */ void *vmalloc_32_user(unsigned long size) { return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, VM_USERMAP, NUMA_NO_NODE, __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_32_user); /* * small helper routine , copy contents to buf from addr. * If the page is not present, fill zero. */ static int aligned_vread(char *buf, char *addr, unsigned long count) { struct page *p; int copied = 0; while (count) { unsigned long offset, length; offset = offset_in_page(addr); length = PAGE_SIZE - offset; if (length > count) length = count; p = vmalloc_to_page(addr); /* * To do safe access to this _mapped_ area, we need * lock. But adding lock here means that we need to add * overhead of vmalloc()/vfree() calls for this _debug_ * interface, rarely used. Instead of that, we'll use * kmap() and get small overhead in this access function. */ if (p) { /* We can expect USER0 is not used -- see vread() */ void *map = kmap_atomic(p); memcpy(buf, map + offset, length); kunmap_atomic(map); } else memset(buf, 0, length); addr += length; buf += length; copied += length; count -= length; } return copied; } /** * vread() - read vmalloc area in a safe way. * @buf: buffer for reading data * @addr: vm address. * @count: number of bytes to be read. * * This function checks that addr is a valid vmalloc'ed area, and * copy data from that area to a given buffer. If the given memory range * of [addr...addr+count) includes some valid address, data is copied to * proper area of @buf. If there are memory holes, they'll be zero-filled. * IOREMAP area is treated as memory hole and no copy is done. * * If [addr...addr+count) doesn't includes any intersects with alive * vm_struct area, returns 0. @buf should be kernel's buffer. * * Note: In usual ops, vread() is never necessary because the caller * should know vmalloc() area is valid and can use memcpy(). * This is for routines which have to access vmalloc area without * any information, as /proc/kcore. * * Return: number of bytes for which addr and buf should be increased * (same number as @count) or %0 if [addr...addr+count) doesn't * include any intersection with valid vmalloc area */ long vread(char *buf, char *addr, unsigned long count) { struct vmap_area *va; struct vm_struct *vm; char *vaddr, *buf_start = buf; unsigned long buflen = count; unsigned long n; addr = kasan_reset_tag(addr); /* Don't allow overflow */ if ((unsigned long) addr + count < count) count = -(unsigned long) addr; spin_lock(&vmap_area_lock); va = find_vmap_area_exceed_addr((unsigned long)addr); if (!va) goto finished; /* no intersects with alive vmap_area */ if ((unsigned long)addr + count <= va->va_start) goto finished; list_for_each_entry_from(va, &vmap_area_list, list) { if (!count) break; if (!va->vm) continue; vm = va->vm; vaddr = (char *) vm->addr; if (addr >= vaddr + get_vm_area_size(vm)) continue; while (addr < vaddr) { if (count == 0) goto finished; *buf = '\0'; buf++; addr++; count--; } n = vaddr + get_vm_area_size(vm) - addr; if (n > count) n = count; if (!(vm->flags & VM_IOREMAP)) aligned_vread(buf, addr, n); else /* IOREMAP area is treated as memory hole */ memset(buf, 0, n); buf += n; addr += n; count -= n; } finished: spin_unlock(&vmap_area_lock); if (buf == buf_start) return 0; /* zero-fill memory holes */ if (buf != buf_start + buflen) memset(buf, 0, buflen - (buf - buf_start)); return buflen; } /** * remap_vmalloc_range_partial - map vmalloc pages to userspace * @vma: vma to cover * @uaddr: target user address to start at * @kaddr: virtual address of vmalloc kernel memory * @pgoff: offset from @kaddr to start at * @size: size of map area * * Returns: 0 for success, -Exxx on failure * * This function checks that @kaddr is a valid vmalloc'ed area, * and that it is big enough to cover the range starting at * @uaddr in @vma. Will return failure if that criteria isn't * met. * * Similar to remap_pfn_range() (see mm/memory.c) */ int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr, void *kaddr, unsigned long pgoff, unsigned long size) { struct vm_struct *area; unsigned long off; unsigned long end_index; if (check_shl_overflow(pgoff, PAGE_SHIFT, &off)) return -EINVAL; size = PAGE_ALIGN(size); if (!PAGE_ALIGNED(uaddr) || !PAGE_ALIGNED(kaddr)) return -EINVAL; area = find_vm_area(kaddr); if (!area) return -EINVAL; if (!(area->flags & (VM_USERMAP | VM_DMA_COHERENT))) return -EINVAL; if (check_add_overflow(size, off, &end_index) || end_index > get_vm_area_size(area)) return -EINVAL; kaddr += off; do { struct page *page = vmalloc_to_page(kaddr); int ret; ret = vm_insert_page(vma, uaddr, page); if (ret) return ret; uaddr += PAGE_SIZE; kaddr += PAGE_SIZE; size -= PAGE_SIZE; } while (size > 0); vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); return 0; } /** * remap_vmalloc_range - map vmalloc pages to userspace * @vma: vma to cover (map full range of vma) * @addr: vmalloc memory * @pgoff: number of pages into addr before first page to map * * Returns: 0 for success, -Exxx on failure * * This function checks that addr is a valid vmalloc'ed area, and * that it is big enough to cover the vma. Will return failure if * that criteria isn't met. * * Similar to remap_pfn_range() (see mm/memory.c) */ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, unsigned long pgoff) { return remap_vmalloc_range_partial(vma, vma->vm_start, addr, pgoff, vma->vm_end - vma->vm_start); } EXPORT_SYMBOL(remap_vmalloc_range); void free_vm_area(struct vm_struct *area) { struct vm_struct *ret; ret = remove_vm_area(area->addr); BUG_ON(ret != area); kfree(area); } EXPORT_SYMBOL_GPL(free_vm_area); #ifdef CONFIG_SMP static struct vmap_area *node_to_va(struct rb_node *n) { return rb_entry_safe(n, struct vmap_area, rb_node); } /** * pvm_find_va_enclose_addr - find the vmap_area @addr belongs to * @addr: target address * * Returns: vmap_area if it is found. If there is no such area * the first highest(reverse order) vmap_area is returned * i.e. va->va_start < addr && va->va_end < addr or NULL * if there are no any areas before @addr. */ static struct vmap_area * pvm_find_va_enclose_addr(unsigned long addr) { struct vmap_area *va, *tmp; struct rb_node *n; n = free_vmap_area_root.rb_node; va = NULL; while (n) { tmp = rb_entry(n, struct vmap_area, rb_node); if (tmp->va_start <= addr) { va = tmp; if (tmp->va_end >= addr) break; n = n->rb_right; } else { n = n->rb_left; } } return va; } /** * pvm_determine_end_from_reverse - find the highest aligned address * of free block below VMALLOC_END * @va: * in - the VA we start the search(reverse order); * out - the VA with the highest aligned end address. * @align: alignment for required highest address * * Returns: determined end address within vmap_area */ static unsigned long pvm_determine_end_from_reverse(struct vmap_area **va, unsigned long align) { unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); unsigned long addr; if (likely(*va)) { list_for_each_entry_from_reverse((*va), &free_vmap_area_list, list) { addr = min((*va)->va_end & ~(align - 1), vmalloc_end); if ((*va)->va_start < addr) return addr; } } return 0; } /** * pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator * @offsets: array containing offset of each area * @sizes: array containing size of each area * @nr_vms: the number of areas to allocate * @align: alignment, all entries in @offsets and @sizes must be aligned to this * * Returns: kmalloc'd vm_struct pointer array pointing to allocated * vm_structs on success, %NULL on failure * * Percpu allocator wants to use congruent vm areas so that it can * maintain the offsets among percpu areas. This function allocates * congruent vmalloc areas for it with GFP_KERNEL. These areas tend to * be scattered pretty far, distance between two areas easily going up * to gigabytes. To avoid interacting with regular vmallocs, these * areas are allocated from top. * * Despite its complicated look, this allocator is rather simple. It * does everything top-down and scans free blocks from the end looking * for matching base. While scanning, if any of the areas do not fit the * base address is pulled down to fit the area. Scanning is repeated till * all the areas fit and then all necessary data structures are inserted * and the result is returned. */ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, const size_t *sizes, int nr_vms, size_t align) { const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align); const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); struct vmap_area **vas, *va; struct vm_struct **vms; int area, area2, last_area, term_area; unsigned long base, start, size, end, last_end, orig_start, orig_end; bool purged = false; /* verify parameters and allocate data structures */ BUG_ON(offset_in_page(align) || !is_power_of_2(align)); for (last_area = 0, area = 0; area < nr_vms; area++) { start = offsets[area]; end = start + sizes[area]; /* is everything aligned properly? */ BUG_ON(!IS_ALIGNED(offsets[area], align)); BUG_ON(!IS_ALIGNED(sizes[area], align)); /* detect the area with the highest address */ if (start > offsets[last_area]) last_area = area; for (area2 = area + 1; area2 < nr_vms; area2++) { unsigned long start2 = offsets[area2]; unsigned long end2 = start2 + sizes[area2]; BUG_ON(start2 < end && start < end2); } } last_end = offsets[last_area] + sizes[last_area]; if (vmalloc_end - vmalloc_start < last_end) { WARN_ON(true); return NULL; } vms = kcalloc(nr_vms, sizeof(vms[0]), GFP_KERNEL); vas = kcalloc(nr_vms, sizeof(vas[0]), GFP_KERNEL); if (!vas || !vms) goto err_free2; for (area = 0; area < nr_vms; area++) { vas[area] = kmem_cache_zalloc(vmap_area_cachep, GFP_KERNEL); vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL); if (!vas[area] || !vms[area]) goto err_free; } retry: spin_lock(&free_vmap_area_lock); /* start scanning - we scan from the top, begin with the last area */ area = term_area = last_area; start = offsets[area]; end = start + sizes[area]; va = pvm_find_va_enclose_addr(vmalloc_end); base = pvm_determine_end_from_reverse(&va, align) - end; while (true) { /* * base might have underflowed, add last_end before * comparing. */ if (base + last_end < vmalloc_start + last_end) goto overflow; /* * Fitting base has not been found. */ if (va == NULL) goto overflow; /* * If required width exceeds current VA block, move * base downwards and then recheck. */ if (base + end > va->va_end) { base = pvm_determine_end_from_reverse(&va, align) - end; term_area = area; continue; } /* * If this VA does not fit, move base downwards and recheck. */ if (base + start < va->va_start) { va = node_to_va(rb_prev(&va->rb_node)); base = pvm_determine_end_from_reverse(&va, align) - end; term_area = area; continue; } /* * This area fits, move on to the previous one. If * the previous one is the terminal one, we're done. */ area = (area + nr_vms - 1) % nr_vms; if (area == term_area) break; start = offsets[area]; end = start + sizes[area]; va = pvm_find_va_enclose_addr(base + end); } /* we've found a fitting base, insert all va's */ for (area = 0; area < nr_vms; area++) { int ret; start = base + offsets[area]; size = sizes[area]; va = pvm_find_va_enclose_addr(start); if (WARN_ON_ONCE(va == NULL)) /* It is a BUG(), but trigger recovery instead. */ goto recovery; ret = adjust_va_to_fit_type(&free_vmap_area_root, &free_vmap_area_list, va, start, size); if (WARN_ON_ONCE(unlikely(ret))) /* It is a BUG(), but trigger recovery instead. */ goto recovery; /* Allocated area. */ va = vas[area]; va->va_start = start; va->va_end = start + size; } spin_unlock(&free_vmap_area_lock); /* populate the kasan shadow space */ for (area = 0; area < nr_vms; area++) { if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area])) goto err_free_shadow; } /* insert all vm's */ spin_lock(&vmap_area_lock); for (area = 0; area < nr_vms; area++) { insert_vmap_area(vas[area], &vmap_area_root, &vmap_area_list); setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC, pcpu_get_vm_areas); } spin_unlock(&vmap_area_lock); /* * Mark allocated areas as accessible. Do it now as a best-effort * approach, as they can be mapped outside of vmalloc code. * With hardware tag-based KASAN, marking is skipped for * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc(). */ for (area = 0; area < nr_vms; area++) vms[area]->addr = kasan_unpoison_vmalloc(vms[area]->addr, vms[area]->size, KASAN_VMALLOC_PROT_NORMAL); kfree(vas); return vms; recovery: /* * Remove previously allocated areas. There is no * need in removing these areas from the busy tree, * because they are inserted only on the final step * and when pcpu_get_vm_areas() is success. */ while (area--) { orig_start = vas[area]->va_start; orig_end = vas[area]->va_end; va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root, &free_vmap_area_list); if (va) kasan_release_vmalloc(orig_start, orig_end, va->va_start, va->va_end); vas[area] = NULL; } overflow: spin_unlock(&free_vmap_area_lock); if (!purged) { purge_vmap_area_lazy(); purged = true; /* Before "retry", check if we recover. */ for (area = 0; area < nr_vms; area++) { if (vas[area]) continue; vas[area] = kmem_cache_zalloc( vmap_area_cachep, GFP_KERNEL); if (!vas[area]) goto err_free; } goto retry; } err_free: for (area = 0; area < nr_vms; area++) { if (vas[area]) kmem_cache_free(vmap_area_cachep, vas[area]); kfree(vms[area]); } err_free2: kfree(vas); kfree(vms); return NULL; err_free_shadow: spin_lock(&free_vmap_area_lock); /* * We release all the vmalloc shadows, even the ones for regions that * hadn't been successfully added. This relies on kasan_release_vmalloc * being able to tolerate this case. */ for (area = 0; area < nr_vms; area++) { orig_start = vas[area]->va_start; orig_end = vas[area]->va_end; va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root, &free_vmap_area_list); if (va) kasan_release_vmalloc(orig_start, orig_end, va->va_start, va->va_end); vas[area] = NULL; kfree(vms[area]); } spin_unlock(&free_vmap_area_lock); kfree(vas); kfree(vms); return NULL; } /** * pcpu_free_vm_areas - free vmalloc areas for percpu allocator * @vms: vm_struct pointer array returned by pcpu_get_vm_areas() * @nr_vms: the number of allocated areas * * Free vm_structs and the array allocated by pcpu_get_vm_areas(). */ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) { int i; for (i = 0; i < nr_vms; i++) free_vm_area(vms[i]); kfree(vms); } #endif /* CONFIG_SMP */ #ifdef CONFIG_PRINTK bool vmalloc_dump_obj(void *object) { void *objp = (void *)PAGE_ALIGN((unsigned long)object); const void *caller; struct vm_struct *vm; struct vmap_area *va; unsigned long addr; unsigned int nr_pages; if (!spin_trylock(&vmap_area_lock)) return false; va = __find_vmap_area((unsigned long)objp, &vmap_area_root); if (!va) { spin_unlock(&vmap_area_lock); return false; } vm = va->vm; if (!vm) { spin_unlock(&vmap_area_lock); return false; } addr = (unsigned long)vm->addr; caller = vm->caller; nr_pages = vm->nr_pages; spin_unlock(&vmap_area_lock); pr_cont(" %u-page vmalloc region starting at %#lx allocated at %pS\n", nr_pages, addr, caller); return true; } #endif #ifdef CONFIG_PROC_FS static void *s_start(struct seq_file *m, loff_t *pos) __acquires(&vmap_purge_lock) __acquires(&vmap_area_lock) { mutex_lock(&vmap_purge_lock); spin_lock(&vmap_area_lock); return seq_list_start(&vmap_area_list, *pos); } static void *s_next(struct seq_file *m, void *p, loff_t *pos) { return seq_list_next(p, &vmap_area_list, pos); } static void s_stop(struct seq_file *m, void *p) __releases(&vmap_area_lock) __releases(&vmap_purge_lock) { spin_unlock(&vmap_area_lock); mutex_unlock(&vmap_purge_lock); } static void show_numa_info(struct seq_file *m, struct vm_struct *v) { if (IS_ENABLED(CONFIG_NUMA)) { unsigned int nr, *counters = m->private; unsigned int step = 1U << vm_area_page_order(v); if (!counters) return; if (v->flags & VM_UNINITIALIZED) return; /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */ smp_rmb(); memset(counters, 0, nr_node_ids * sizeof(unsigned int)); for (nr = 0; nr < v->nr_pages; nr += step) counters[page_to_nid(v->pages[nr])] += step; for_each_node_state(nr, N_HIGH_MEMORY) if (counters[nr]) seq_printf(m, " N%u=%u", nr, counters[nr]); } } static void show_purge_info(struct seq_file *m) { struct vmap_area *va; spin_lock(&purge_vmap_area_lock); list_for_each_entry(va, &purge_vmap_area_list, list) { seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n", (void *)va->va_start, (void *)va->va_end, va->va_end - va->va_start); } spin_unlock(&purge_vmap_area_lock); } static int s_show(struct seq_file *m, void *p) { struct vmap_area *va; struct vm_struct *v; va = list_entry(p, struct vmap_area, list); /* * s_show can encounter race with remove_vm_area, !vm on behalf * of vmap area is being tear down or vm_map_ram allocation. */ if (!va->vm) { seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n", (void *)va->va_start, (void *)va->va_end, va->va_end - va->va_start); goto final; } v = va->vm; seq_printf(m, "0x%pK-0x%pK %7ld", v->addr, v->addr + v->size, v->size); if (v->caller) seq_printf(m, " %pS", v->caller); if (v->nr_pages) seq_printf(m, " pages=%d", v->nr_pages); if (v->phys_addr) seq_printf(m, " phys=%pa", &v->phys_addr); if (v->flags & VM_IOREMAP) seq_puts(m, " ioremap"); if (v->flags & VM_ALLOC) seq_puts(m, " vmalloc"); if (v->flags & VM_MAP) seq_puts(m, " vmap"); if (v->flags & VM_USERMAP) seq_puts(m, " user"); if (v->flags & VM_DMA_COHERENT) seq_puts(m, " dma-coherent"); if (is_vmalloc_addr(v->pages)) seq_puts(m, " vpages"); show_numa_info(m, v); seq_putc(m, '\n'); /* * As a final step, dump "unpurged" areas. */ final: if (list_is_last(&va->list, &vmap_area_list)) show_purge_info(m); return 0; } static const struct seq_operations vmalloc_op = { .start = s_start, .next = s_next, .stop = s_stop, .show = s_show, }; static int __init proc_vmalloc_init(void) { if (IS_ENABLED(CONFIG_NUMA)) proc_create_seq_private("vmallocinfo", 0400, NULL, &vmalloc_op, nr_node_ids * sizeof(unsigned int), NULL); else proc_create_seq("vmallocinfo", 0400, NULL, &vmalloc_op); return 0; } module_init(proc_vmalloc_init); #endif |
165 26 145 35 146 146 146 35 35 28 35 27 27 28 28 28 28 28 27 28 157 52 26 26 157 157 156 26 1 25 25 26 26 157 157 157 2461 2462 157 27 27 27 27 27 27 26 26 26 25 25 26 27 27 157 1 146 145 5 5 5 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 | // SPDX-License-Identifier: GPL-2.0 /* * drivers/base/devres.c - device resource management * * Copyright (c) 2006 SUSE Linux Products GmbH * Copyright (c) 2006 Tejun Heo <teheo@suse.de> */ #include <linux/device.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/percpu.h> #include <asm/sections.h> #include "base.h" #include "trace.h" struct devres_node { struct list_head entry; dr_release_t release; const char *name; size_t size; }; struct devres { struct devres_node node; /* * Some archs want to perform DMA into kmalloc caches * and need a guaranteed alignment larger than * the alignment of a 64-bit integer. * Thus we use ARCH_KMALLOC_MINALIGN here and get exactly the same * buffer alignment as if it was allocated by plain kmalloc(). */ u8 __aligned(ARCH_KMALLOC_MINALIGN) data[]; }; struct devres_group { struct devres_node node[2]; void *id; int color; /* -- 8 pointers */ }; static void set_node_dbginfo(struct devres_node *node, const char *name, size_t size) { node->name = name; node->size = size; } #ifdef CONFIG_DEBUG_DEVRES static int log_devres = 0; module_param_named(log, log_devres, int, S_IRUGO | S_IWUSR); static void devres_dbg(struct device *dev, struct devres_node *node, const char *op) { if (unlikely(log_devres)) dev_err(dev, "DEVRES %3s %p %s (%zu bytes)\n", op, node, node->name, node->size); } #else /* CONFIG_DEBUG_DEVRES */ #define devres_dbg(dev, node, op) do {} while (0) #endif /* CONFIG_DEBUG_DEVRES */ static void devres_log(struct device *dev, struct devres_node *node, const char *op) { trace_devres_log(dev, op, node, node->name, node->size); devres_dbg(dev, node, op); } /* * Release functions for devres group. These callbacks are used only * for identification. */ static void group_open_release(struct device *dev, void *res) { /* noop */ } static void group_close_release(struct device *dev, void *res) { /* noop */ } static struct devres_group * node_to_group(struct devres_node *node) { if (node->release == &group_open_release) return container_of(node, struct devres_group, node[0]); if (node->release == &group_close_release) return container_of(node, struct devres_group, node[1]); return NULL; } static bool check_dr_size(size_t size, size_t *tot_size) { /* We must catch any near-SIZE_MAX cases that could overflow. */ if (unlikely(check_add_overflow(sizeof(struct devres), size, tot_size))) return false; return true; } static __always_inline struct devres * alloc_dr(dr_release_t release, size_t size, gfp_t gfp, int nid) { size_t tot_size; struct devres *dr; if (!check_dr_size(size, &tot_size)) return NULL; dr = kmalloc_node_track_caller(tot_size, gfp, nid); if (unlikely(!dr)) return NULL; /* No need to clear memory twice */ if (!(gfp & __GFP_ZERO)) memset(dr, 0, offsetof(struct devres, data)); INIT_LIST_HEAD(&dr->node.entry); dr->node.release = release; return dr; } static void add_dr(struct device *dev, struct devres_node *node) { devres_log(dev, node, "ADD"); BUG_ON(!list_empty(&node->entry)); list_add_tail(&node->entry, &dev->devres_head); } static void replace_dr(struct device *dev, struct devres_node *old, struct devres_node *new) { devres_log(dev, old, "REPLACE"); BUG_ON(!list_empty(&new->entry)); list_replace(&old->entry, &new->entry); } /** * __devres_alloc_node - Allocate device resource data * @release: Release function devres will be associated with * @size: Allocation size * @gfp: Allocation flags * @nid: NUMA node * @name: Name of the resource * * Allocate devres of @size bytes. The allocated area is zeroed, then * associated with @release. The returned pointer can be passed to * other devres_*() functions. * * RETURNS: * Pointer to allocated devres on success, NULL on failure. */ void *__devres_alloc_node(dr_release_t release, size_t size, gfp_t gfp, int nid, const char *name) { struct devres *dr; dr = alloc_dr(release, size, gfp | __GFP_ZERO, nid); if (unlikely(!dr)) return NULL; set_node_dbginfo(&dr->node, name, size); return dr->data; } EXPORT_SYMBOL_GPL(__devres_alloc_node); /** * devres_for_each_res - Resource iterator * @dev: Device to iterate resource from * @release: Look for resources associated with this release function * @match: Match function (optional) * @match_data: Data for the match function * @fn: Function to be called for each matched resource. * @data: Data for @fn, the 3rd parameter of @fn * * Call @fn for each devres of @dev which is associated with @release * and for which @match returns 1. * * RETURNS: * void */ void devres_for_each_res(struct device *dev, dr_release_t release, dr_match_t match, void *match_data, void (*fn)(struct device *, void *, void *), void *data) { struct devres_node *node; struct devres_node *tmp; unsigned long flags; if (!fn) return; spin_lock_irqsave(&dev->devres_lock, flags); list_for_each_entry_safe_reverse(node, tmp, &dev->devres_head, entry) { struct devres *dr = container_of(node, struct devres, node); if (node->release != release) continue; if (match && !match(dev, dr->data, match_data)) continue; fn(dev, dr->data, data); } spin_unlock_irqrestore(&dev->devres_lock, flags); } EXPORT_SYMBOL_GPL(devres_for_each_res); /** * devres_free - Free device resource data * @res: Pointer to devres data to free * * Free devres created with devres_alloc(). */ void devres_free(void *res) { if (res) { struct devres *dr = container_of(res, struct devres, data); BUG_ON(!list_empty(&dr->node.entry)); kfree(dr); } } EXPORT_SYMBOL_GPL(devres_free); /** * devres_add - Register device resource * @dev: Device to add resource to * @res: Resource to register * * Register devres @res to @dev. @res should have been allocated * using devres_alloc(). On driver detach, the associated release * function will be invoked and devres will be freed automatically. */ void devres_add(struct device *dev, void *res) { struct devres *dr = container_of(res, struct devres, data); unsigned long flags; spin_lock_irqsave(&dev->devres_lock, flags); add_dr(dev, &dr->node); spin_unlock_irqrestore(&dev->devres_lock, flags); } EXPORT_SYMBOL_GPL(devres_add); static struct devres *find_dr(struct device *dev, dr_release_t release, dr_match_t match, void *match_data) { struct devres_node *node; list_for_each_entry_reverse(node, &dev->devres_head, entry) { struct devres *dr = container_of(node, struct devres, node); if (node->release != release) continue; if (match && !match(dev, dr->data, match_data)) continue; return dr; } return NULL; } /** * devres_find - Find device resource * @dev: Device to lookup resource from * @release: Look for resources associated with this release function * @match: Match function (optional) * @match_data: Data for the match function * * Find the latest devres of @dev which is associated with @release * and for which @match returns 1. If @match is NULL, it's considered * to match all. * * RETURNS: * Pointer to found devres, NULL if not found. */ void * devres_find(struct device *dev, dr_release_t release, dr_match_t match, void *match_data) { struct devres *dr; unsigned long flags; spin_lock_irqsave(&dev->devres_lock, flags); dr = find_dr(dev, release, match, match_data); spin_unlock_irqrestore(&dev->devres_lock, flags); if (dr) return dr->data; return NULL; } EXPORT_SYMBOL_GPL(devres_find); /** * devres_get - Find devres, if non-existent, add one atomically * @dev: Device to lookup or add devres for * @new_res: Pointer to new initialized devres to add if not found * @match: Match function (optional) * @match_data: Data for the match function * * Find the latest devres of @dev which has the same release function * as @new_res and for which @match return 1. If found, @new_res is * freed; otherwise, @new_res is added atomically. * * RETURNS: * Pointer to found or added devres. */ void * devres_get(struct device *dev, void *new_res, dr_match_t match, void *match_data) { struct devres *new_dr = container_of(new_res, struct devres, data); struct devres *dr; unsigned long flags; spin_lock_irqsave(&dev->devres_lock, flags); dr = find_dr(dev, new_dr->node.release, match, match_data); if (!dr) { add_dr(dev, &new_dr->node); dr = new_dr; new_res = NULL; } spin_unlock_irqrestore(&dev->devres_lock, flags); devres_free(new_res); return dr->data; } EXPORT_SYMBOL_GPL(devres_get); /** * devres_remove - Find a device resource and remove it * @dev: Device to find resource from * @release: Look for resources associated with this release function * @match: Match function (optional) * @match_data: Data for the match function * * Find the latest devres of @dev associated with @release and for * which @match returns 1. If @match is NULL, it's considered to * match all. If found, the resource is removed atomically and * returned. * * RETURNS: * Pointer to removed devres on success, NULL if not found. */ void * devres_remove(struct device *dev, dr_release_t release, dr_match_t match, void *match_data) { struct devres *dr; unsigned long flags; spin_lock_irqsave(&dev->devres_lock, flags); dr = find_dr(dev, release, match, match_data); if (dr) { list_del_init(&dr->node.entry); devres_log(dev, &dr->node, "REM"); } spin_unlock_irqrestore(&dev->devres_lock, flags); if (dr) return dr->data; return NULL; } EXPORT_SYMBOL_GPL(devres_remove); /** * devres_destroy - Find a device resource and destroy it * @dev: Device to find resource from * @release: Look for resources associated with this release function * @match: Match function (optional) * @match_data: Data for the match function * * Find the latest devres of @dev associated with @release and for * which @match returns 1. If @match is NULL, it's considered to * match all. If found, the resource is removed atomically and freed. * * Note that the release function for the resource will not be called, * only the devres-allocated data will be freed. The caller becomes * responsible for freeing any other data. * * RETURNS: * 0 if devres is found and freed, -ENOENT if not found. */ int devres_destroy(struct device *dev, dr_release_t release, dr_match_t match, void *match_data) { void *res; res = devres_remove(dev, release, match, match_data); if (unlikely(!res)) return -ENOENT; devres_free(res); return 0; } EXPORT_SYMBOL_GPL(devres_destroy); /** * devres_release - Find a device resource and destroy it, calling release * @dev: Device to find resource from * @release: Look for resources associated with this release function * @match: Match function (optional) * @match_data: Data for the match function * * Find the latest devres of @dev associated with @release and for * which @match returns 1. If @match is NULL, it's considered to * match all. If found, the resource is removed atomically, the * release function called and the resource freed. * * RETURNS: * 0 if devres is found and freed, -ENOENT if not found. */ int devres_release(struct device *dev, dr_release_t release, dr_match_t match, void *match_data) { void *res; res = devres_remove(dev, release, match, match_data); if (unlikely(!res)) return -ENOENT; (*release)(dev, res); devres_free(res); return 0; } EXPORT_SYMBOL_GPL(devres_release); static int remove_nodes(struct device *dev, struct list_head *first, struct list_head *end, struct list_head *todo) { struct devres_node *node, *n; int cnt = 0, nr_groups = 0; /* First pass - move normal devres entries to @todo and clear * devres_group colors. */ node = list_entry(first, struct devres_node, entry); list_for_each_entry_safe_from(node, n, end, entry) { struct devres_group *grp; grp = node_to_group(node); if (grp) { /* clear color of group markers in the first pass */ grp->color = 0; nr_groups++; } else { /* regular devres entry */ if (&node->entry == first) first = first->next; list_move_tail(&node->entry, todo); cnt++; } } if (!nr_groups) return cnt; /* Second pass - Scan groups and color them. A group gets * color value of two iff the group is wholly contained in * [current node, end). That is, for a closed group, both opening * and closing markers should be in the range, while just the * opening marker is enough for an open group. */ node = list_entry(first, struct devres_node, entry); list_for_each_entry_safe_from(node, n, end, entry) { struct devres_group *grp; grp = node_to_group(node); BUG_ON(!grp || list_empty(&grp->node[0].entry)); grp->color++; if (list_empty(&grp->node[1].entry)) grp->color++; BUG_ON(grp->color <= 0 || grp->color > 2); if (grp->color == 2) { /* No need to update current node or end. The removed * nodes are always before both. */ list_move_tail(&grp->node[0].entry, todo); list_del_init(&grp->node[1].entry); } } return cnt; } static void release_nodes(struct device *dev, struct list_head *todo) { struct devres *dr, *tmp; /* Release. Note that both devres and devres_group are * handled as devres in the following loop. This is safe. */ list_for_each_entry_safe_reverse(dr, tmp, todo, node.entry) { devres_log(dev, &dr->node, "REL"); dr->node.release(dev, dr->data); kfree(dr); } } /** * devres_release_all - Release all managed resources * @dev: Device to release resources for * * Release all resources associated with @dev. This function is * called on driver detach. */ int devres_release_all(struct device *dev) { unsigned long flags; LIST_HEAD(todo); int cnt; /* Looks like an uninitialized device structure */ if (WARN_ON(dev->devres_head.next == NULL)) return -ENODEV; /* Nothing to release if list is empty */ if (list_empty(&dev->devres_head)) return 0; spin_lock_irqsave(&dev->devres_lock, flags); cnt = remove_nodes(dev, dev->devres_head.next, &dev->devres_head, &todo); spin_unlock_irqrestore(&dev->devres_lock, flags); release_nodes(dev, &todo); return cnt; } /** * devres_open_group - Open a new devres group * @dev: Device to open devres group for * @id: Separator ID * @gfp: Allocation flags * * Open a new devres group for @dev with @id. For @id, using a * pointer to an object which won't be used for another group is * recommended. If @id is NULL, address-wise unique ID is created. * * RETURNS: * ID of the new group, NULL on failure. */ void * devres_open_group(struct device *dev, void *id, gfp_t gfp) { struct devres_group *grp; unsigned long flags; grp = kmalloc(sizeof(*grp), gfp); if (unlikely(!grp)) return NULL; grp->node[0].release = &group_open_release; grp->node[1].release = &group_close_release; INIT_LIST_HEAD(&grp->node[0].entry); INIT_LIST_HEAD(&grp->node[1].entry); set_node_dbginfo(&grp->node[0], "grp<", 0); set_node_dbginfo(&grp->node[1], "grp>", 0); grp->id = grp; if (id) grp->id = id; grp->color = 0; spin_lock_irqsave(&dev->devres_lock, flags); add_dr(dev, &grp->node[0]); spin_unlock_irqrestore(&dev->devres_lock, flags); return grp->id; } EXPORT_SYMBOL_GPL(devres_open_group); /* Find devres group with ID @id. If @id is NULL, look for the latest. */ static struct devres_group * find_group(struct device *dev, void *id) { struct devres_node *node; list_for_each_entry_reverse(node, &dev->devres_head, entry) { struct devres_group *grp; if (node->release != &group_open_release) continue; grp = container_of(node, struct devres_group, node[0]); if (id) { if (grp->id == id) return grp; } else if (list_empty(&grp->node[1].entry)) return grp; } return NULL; } /** * devres_close_group - Close a devres group * @dev: Device to close devres group for * @id: ID of target group, can be NULL * * Close the group identified by @id. If @id is NULL, the latest open * group is selected. */ void devres_close_group(struct device *dev, void *id) { struct devres_group *grp; unsigned long flags; spin_lock_irqsave(&dev->devres_lock, flags); grp = find_group(dev, id); if (grp) add_dr(dev, &grp->node[1]); else WARN_ON(1); spin_unlock_irqrestore(&dev->devres_lock, flags); } EXPORT_SYMBOL_GPL(devres_close_group); /** * devres_remove_group - Remove a devres group * @dev: Device to remove group for * @id: ID of target group, can be NULL * * Remove the group identified by @id. If @id is NULL, the latest * open group is selected. Note that removing a group doesn't affect * any other resources. */ void devres_remove_group(struct device *dev, void *id) { struct devres_group *grp; unsigned long flags; spin_lock_irqsave(&dev->devres_lock, flags); grp = find_group(dev, id); if (grp) { list_del_init(&grp->node[0].entry); list_del_init(&grp->node[1].entry); devres_log(dev, &grp->node[0], "REM"); } else WARN_ON(1); spin_unlock_irqrestore(&dev->devres_lock, flags); kfree(grp); } EXPORT_SYMBOL_GPL(devres_remove_group); /** * devres_release_group - Release resources in a devres group * @dev: Device to release group for * @id: ID of target group, can be NULL * * Release all resources in the group identified by @id. If @id is * NULL, the latest open group is selected. The selected group and * groups properly nested inside the selected group are removed. * * RETURNS: * The number of released non-group resources. */ int devres_release_group(struct device *dev, void *id) { struct devres_group *grp; unsigned long flags; LIST_HEAD(todo); int cnt = 0; spin_lock_irqsave(&dev->devres_lock, flags); grp = find_group(dev, id); if (grp) { struct list_head *first = &grp->node[0].entry; struct list_head *end = &dev->devres_head; if (!list_empty(&grp->node[1].entry)) end = grp->node[1].entry.next; cnt = remove_nodes(dev, first, end, &todo); spin_unlock_irqrestore(&dev->devres_lock, flags); release_nodes(dev, &todo); } else { WARN_ON(1); spin_unlock_irqrestore(&dev->devres_lock, flags); } return cnt; } EXPORT_SYMBOL_GPL(devres_release_group); /* * Custom devres actions allow inserting a simple function call * into the teardown sequence. */ struct action_devres { void *data; void (*action)(void *); }; static int devm_action_match(struct device *dev, void *res, void *p) { struct action_devres *devres = res; struct action_devres *target = p; return devres->action == target->action && devres->data == target->data; } static void devm_action_release(struct device *dev, void *res) { struct action_devres *devres = res; devres->action(devres->data); } /** * devm_add_action() - add a custom action to list of managed resources * @dev: Device that owns the action * @action: Function that should be called * @data: Pointer to data passed to @action implementation * * This adds a custom action to the list of managed resources so that * it gets executed as part of standard resource unwinding. */ int devm_add_action(struct device *dev, void (*action)(void *), void *data) { struct action_devres *devres; devres = devres_alloc(devm_action_release, sizeof(struct action_devres), GFP_KERNEL); if (!devres) return -ENOMEM; devres->data = data; devres->action = action; devres_add(dev, devres); return 0; } EXPORT_SYMBOL_GPL(devm_add_action); /** * devm_remove_action() - removes previously added custom action * @dev: Device that owns the action * @action: Function implementing the action * @data: Pointer to data passed to @action implementation * * Removes instance of @action previously added by devm_add_action(). * Both action and data should match one of the existing entries. */ void devm_remove_action(struct device *dev, void (*action)(void *), void *data) { struct action_devres devres = { .data = data, .action = action, }; WARN_ON(devres_destroy(dev, devm_action_release, devm_action_match, &devres)); } EXPORT_SYMBOL_GPL(devm_remove_action); /** * devm_release_action() - release previously added custom action * @dev: Device that owns the action * @action: Function implementing the action * @data: Pointer to data passed to @action implementation * * Releases and removes instance of @action previously added by * devm_add_action(). Both action and data should match one of the * existing entries. */ void devm_release_action(struct device *dev, void (*action)(void *), void *data) { struct action_devres devres = { .data = data, .action = action, }; WARN_ON(devres_release(dev, devm_action_release, devm_action_match, &devres)); } EXPORT_SYMBOL_GPL(devm_release_action); /* * Managed kmalloc/kfree */ static void devm_kmalloc_release(struct device *dev, void *res) { /* noop */ } static int devm_kmalloc_match(struct device *dev, void *res, void *data) { return res == data; } /** * devm_kmalloc - Resource-managed kmalloc * @dev: Device to allocate memory for * @size: Allocation size * @gfp: Allocation gfp flags * * Managed kmalloc. Memory allocated with this function is * automatically freed on driver detach. Like all other devres * resources, guaranteed alignment is unsigned long long. * * RETURNS: * Pointer to allocated memory on success, NULL on failure. */ void *devm_kmalloc(struct device *dev, size_t size, gfp_t gfp) { struct devres *dr; if (unlikely(!size)) return ZERO_SIZE_PTR; /* use raw alloc_dr for kmalloc caller tracing */ dr = alloc_dr(devm_kmalloc_release, size, gfp, dev_to_node(dev)); if (unlikely(!dr)) return NULL; /* * This is named devm_kzalloc_release for historical reasons * The initial implementation did not support kmalloc, only kzalloc */ set_node_dbginfo(&dr->node, "devm_kzalloc_release", size); devres_add(dev, dr->data); return dr->data; } EXPORT_SYMBOL_GPL(devm_kmalloc); /** * devm_krealloc - Resource-managed krealloc() * @dev: Device to re-allocate memory for * @ptr: Pointer to the memory chunk to re-allocate * @new_size: New allocation size * @gfp: Allocation gfp flags * * Managed krealloc(). Resizes the memory chunk allocated with devm_kmalloc(). * Behaves similarly to regular krealloc(): if @ptr is NULL or ZERO_SIZE_PTR, * it's the equivalent of devm_kmalloc(). If new_size is zero, it frees the * previously allocated memory and returns ZERO_SIZE_PTR. This function doesn't * change the order in which the release callback for the re-alloc'ed devres * will be called (except when falling back to devm_kmalloc() or when freeing * resources when new_size is zero). The contents of the memory are preserved * up to the lesser of new and old sizes. */ void *devm_krealloc(struct device *dev, void *ptr, size_t new_size, gfp_t gfp) { size_t total_new_size, total_old_size; struct devres *old_dr, *new_dr; unsigned long flags; if (unlikely(!new_size)) { devm_kfree(dev, ptr); return ZERO_SIZE_PTR; } if (unlikely(ZERO_OR_NULL_PTR(ptr))) return devm_kmalloc(dev, new_size, gfp); if (WARN_ON(is_kernel_rodata((unsigned long)ptr))) /* * We cannot reliably realloc a const string returned by * devm_kstrdup_const(). */ return NULL; if (!check_dr_size(new_size, &total_new_size)) return NULL; total_old_size = ksize(container_of(ptr, struct devres, data)); if (total_old_size == 0) { WARN(1, "Pointer doesn't point to dynamically allocated memory."); return NULL; } /* * If new size is smaller or equal to the actual number of bytes * allocated previously - just return the same pointer. */ if (total_new_size <= total_old_size) return ptr; /* * Otherwise: allocate new, larger chunk. We need to allocate before * taking the lock as most probably the caller uses GFP_KERNEL. * alloc_dr() will call check_dr_size() to reserve extra memory * for struct devres automatically, so size @new_size user request * is delivered to it directly as devm_kmalloc() does. */ new_dr = alloc_dr(devm_kmalloc_release, new_size, gfp, dev_to_node(dev)); if (!new_dr) return NULL; /* * The spinlock protects the linked list against concurrent * modifications but not the resource itself. */ spin_lock_irqsave(&dev->devres_lock, flags); old_dr = find_dr(dev, devm_kmalloc_release, devm_kmalloc_match, ptr); if (!old_dr) { spin_unlock_irqrestore(&dev->devres_lock, flags); kfree(new_dr); WARN(1, "Memory chunk not managed or managed by a different device."); return NULL; } replace_dr(dev, &old_dr->node, &new_dr->node); spin_unlock_irqrestore(&dev->devres_lock, flags); /* * We can copy the memory contents after releasing the lock as we're * no longer modifying the list links. */ memcpy(new_dr->data, old_dr->data, total_old_size - offsetof(struct devres, data)); /* * Same for releasing the old devres - it's now been removed from the * list. This is also the reason why we must not use devm_kfree() - the * links are no longer valid. */ kfree(old_dr); return new_dr->data; } EXPORT_SYMBOL_GPL(devm_krealloc); /** * devm_kstrdup - Allocate resource managed space and * copy an existing string into that. * @dev: Device to allocate memory for * @s: the string to duplicate * @gfp: the GFP mask used in the devm_kmalloc() call when * allocating memory * RETURNS: * Pointer to allocated string on success, NULL on failure. */ char *devm_kstrdup(struct device *dev, const char *s, gfp_t gfp) { size_t size; char *buf; if (!s) return NULL; size = strlen(s) + 1; buf = devm_kmalloc(dev, size, gfp); if (buf) memcpy(buf, s, size); return buf; } EXPORT_SYMBOL_GPL(devm_kstrdup); /** * devm_kstrdup_const - resource managed conditional string duplication * @dev: device for which to duplicate the string * @s: the string to duplicate * @gfp: the GFP mask used in the kmalloc() call when allocating memory * * Strings allocated by devm_kstrdup_const will be automatically freed when * the associated device is detached. * * RETURNS: * Source string if it is in .rodata section otherwise it falls back to * devm_kstrdup. */ const char *devm_kstrdup_const(struct device *dev, const char *s, gfp_t gfp) { if (is_kernel_rodata((unsigned long)s)) return s; return devm_kstrdup(dev, s, gfp); } EXPORT_SYMBOL_GPL(devm_kstrdup_const); /** * devm_kvasprintf - Allocate resource managed space and format a string * into that. * @dev: Device to allocate memory for * @gfp: the GFP mask used in the devm_kmalloc() call when * allocating memory * @fmt: The printf()-style format string * @ap: Arguments for the format string * RETURNS: * Pointer to allocated string on success, NULL on failure. */ char *devm_kvasprintf(struct device *dev, gfp_t gfp, const char *fmt, va_list ap) { unsigned int len; char *p; va_list aq; va_copy(aq, ap); len = vsnprintf(NULL, 0, fmt, aq); va_end(aq); p = devm_kmalloc(dev, len+1, gfp); if (!p) return NULL; vsnprintf(p, len+1, fmt, ap); return p; } EXPORT_SYMBOL(devm_kvasprintf); /** * devm_kasprintf - Allocate resource managed space and format a string * into that. * @dev: Device to allocate memory for * @gfp: the GFP mask used in the devm_kmalloc() call when * allocating memory * @fmt: The printf()-style format string * @...: Arguments for the format string * RETURNS: * Pointer to allocated string on success, NULL on failure. */ char *devm_kasprintf(struct device *dev, gfp_t gfp, const char *fmt, ...) { va_list ap; char *p; va_start(ap, fmt); p = devm_kvasprintf(dev, gfp, fmt, ap); va_end(ap); return p; } EXPORT_SYMBOL_GPL(devm_kasprintf); /** * devm_kfree - Resource-managed kfree * @dev: Device this memory belongs to * @p: Memory to free * * Free memory allocated with devm_kmalloc(). */ void devm_kfree(struct device *dev, const void *p) { int rc; /* * Special cases: pointer to a string in .rodata returned by * devm_kstrdup_const() or NULL/ZERO ptr. */ if (unlikely(is_kernel_rodata((unsigned long)p) || ZERO_OR_NULL_PTR(p))) return; rc = devres_destroy(dev, devm_kmalloc_release, devm_kmalloc_match, (void *)p); WARN_ON(rc); } EXPORT_SYMBOL_GPL(devm_kfree); /** * devm_kmemdup - Resource-managed kmemdup * @dev: Device this memory belongs to * @src: Memory region to duplicate * @len: Memory region length * @gfp: GFP mask to use * * Duplicate region of a memory using resource managed kmalloc */ void *devm_kmemdup(struct device *dev, const void *src, size_t len, gfp_t gfp) { void *p; p = devm_kmalloc(dev, len, gfp); if (p) memcpy(p, src, len); return p; } EXPORT_SYMBOL_GPL(devm_kmemdup); struct pages_devres { unsigned long addr; unsigned int order; }; static int devm_pages_match(struct device *dev, void *res, void *p) { struct pages_devres *devres = res; struct pages_devres *target = p; return devres->addr == target->addr; } static void devm_pages_release(struct device *dev, void *res) { struct pages_devres *devres = res; free_pages(devres->addr, devres->order); } /** * devm_get_free_pages - Resource-managed __get_free_pages * @dev: Device to allocate memory for * @gfp_mask: Allocation gfp flags * @order: Allocation size is (1 << order) pages * * Managed get_free_pages. Memory allocated with this function is * automatically freed on driver detach. * * RETURNS: * Address of allocated memory on success, 0 on failure. */ unsigned long devm_get_free_pages(struct device *dev, gfp_t gfp_mask, unsigned int order) { struct pages_devres *devres; unsigned long addr; addr = __get_free_pages(gfp_mask, order); if (unlikely(!addr)) return 0; devres = devres_alloc(devm_pages_release, sizeof(struct pages_devres), GFP_KERNEL); if (unlikely(!devres)) { free_pages(addr, order); return 0; } devres->addr = addr; devres->order = order; devres_add(dev, devres); return addr; } EXPORT_SYMBOL_GPL(devm_get_free_pages); /** * devm_free_pages - Resource-managed free_pages * @dev: Device this memory belongs to * @addr: Memory to free * * Free memory allocated with devm_get_free_pages(). Unlike free_pages, * there is no need to supply the @order. */ void devm_free_pages(struct device *dev, unsigned long addr) { struct pages_devres devres = { .addr = addr }; WARN_ON(devres_release(dev, devm_pages_release, devm_pages_match, &devres)); } EXPORT_SYMBOL_GPL(devm_free_pages); static void devm_percpu_release(struct device *dev, void *pdata) { void __percpu *p; p = *(void __percpu **)pdata; free_percpu(p); } static int devm_percpu_match(struct device *dev, void *data, void *p) { struct devres *devr = container_of(data, struct devres, data); return *(void **)devr->data == p; } /** * __devm_alloc_percpu - Resource-managed alloc_percpu * @dev: Device to allocate per-cpu memory for * @size: Size of per-cpu memory to allocate * @align: Alignment of per-cpu memory to allocate * * Managed alloc_percpu. Per-cpu memory allocated with this function is * automatically freed on driver detach. * * RETURNS: * Pointer to allocated memory on success, NULL on failure. */ void __percpu *__devm_alloc_percpu(struct device *dev, size_t size, size_t align) { void *p; void __percpu *pcpu; pcpu = __alloc_percpu(size, align); if (!pcpu) return NULL; p = devres_alloc(devm_percpu_release, sizeof(void *), GFP_KERNEL); if (!p) { free_percpu(pcpu); return NULL; } *(void __percpu **)p = pcpu; devres_add(dev, p); return pcpu; } EXPORT_SYMBOL_GPL(__devm_alloc_percpu); /** * devm_free_percpu - Resource-managed free_percpu * @dev: Device this memory belongs to * @pdata: Per-cpu memory to free * * Free memory allocated with devm_alloc_percpu(). */ void devm_free_percpu(struct device *dev, void __percpu *pdata) { /* * Use devres_release() to prevent memory leakage as * devm_free_pages() does. */ WARN_ON(devres_release(dev, devm_percpu_release, devm_percpu_match, (__force void *)pdata)); } EXPORT_SYMBOL_GPL(devm_free_percpu); |
3 2 3 3 4 4 1 3 4 2 2 1 6 6 3 3 6 6 2 170 168 2 2 170 2 4 4 4 4 4 4 4 4 1 1 6 6 146 146 146 170 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 | // SPDX-License-Identifier: GPL-2.0-only /* * HID raw devices, giving access to raw HID events. * * In comparison to hiddev, this device does not process the * hid events at all (no parsing, no lookups). This lets applications * to work on raw hid events as they want to, and avoids a need to * use a transport-specific userspace libhid/libusb libraries. * * Copyright (c) 2007-2014 Jiri Kosina */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/fs.h> #include <linux/module.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/cdev.h> #include <linux/poll.h> #include <linux/device.h> #include <linux/major.h> #include <linux/slab.h> #include <linux/hid.h> #include <linux/mutex.h> #include <linux/sched/signal.h> #include <linux/string.h> #include <linux/hidraw.h> static int hidraw_major; static struct cdev hidraw_cdev; static struct class *hidraw_class; static struct hidraw *hidraw_table[HIDRAW_MAX_DEVICES]; static DECLARE_RWSEM(minors_rwsem); static ssize_t hidraw_read(struct file *file, char __user *buffer, size_t count, loff_t *ppos) { struct hidraw_list *list = file->private_data; int ret = 0, len; DECLARE_WAITQUEUE(wait, current); mutex_lock(&list->read_mutex); while (ret == 0) { if (list->head == list->tail) { add_wait_queue(&list->hidraw->wait, &wait); set_current_state(TASK_INTERRUPTIBLE); while (list->head == list->tail) { if (signal_pending(current)) { ret = -ERESTARTSYS; break; } if (!list->hidraw->exist) { ret = -EIO; break; } if (file->f_flags & O_NONBLOCK) { ret = -EAGAIN; break; } /* allow O_NONBLOCK to work well from other threads */ mutex_unlock(&list->read_mutex); schedule(); mutex_lock(&list->read_mutex); set_current_state(TASK_INTERRUPTIBLE); } set_current_state(TASK_RUNNING); remove_wait_queue(&list->hidraw->wait, &wait); } if (ret) goto out; len = list->buffer[list->tail].len > count ? count : list->buffer[list->tail].len; if (list->buffer[list->tail].value) { if (copy_to_user(buffer, list->buffer[list->tail].value, len)) { ret = -EFAULT; goto out; } ret = len; } kfree(list->buffer[list->tail].value); list->buffer[list->tail].value = NULL; list->tail = (list->tail + 1) & (HIDRAW_BUFFER_SIZE - 1); } out: mutex_unlock(&list->read_mutex); return ret; } /* * The first byte of the report buffer is expected to be a report number. */ static ssize_t hidraw_send_report(struct file *file, const char __user *buffer, size_t count, unsigned char report_type) { unsigned int minor = iminor(file_inode(file)); struct hid_device *dev; __u8 *buf; int ret = 0; lockdep_assert_held(&minors_rwsem); if (!hidraw_table[minor] || !hidraw_table[minor]->exist) { ret = -ENODEV; goto out; } dev = hidraw_table[minor]->hid; if (count > HID_MAX_BUFFER_SIZE) { hid_warn(dev, "pid %d passed too large report\n", task_pid_nr(current)); ret = -EINVAL; goto out; } if (count < 2) { hid_warn(dev, "pid %d passed too short report\n", task_pid_nr(current)); ret = -EINVAL; goto out; } buf = memdup_user(buffer, count); if (IS_ERR(buf)) { ret = PTR_ERR(buf); goto out; } if ((report_type == HID_OUTPUT_REPORT) && !(dev->quirks & HID_QUIRK_NO_OUTPUT_REPORTS_ON_INTR_EP)) { ret = hid_hw_output_report(dev, buf, count); /* * compatibility with old implementation of USB-HID and I2C-HID: * if the device does not support receiving output reports, * on an interrupt endpoint, fallback to SET_REPORT HID command. */ if (ret != -ENOSYS) goto out_free; } ret = hid_hw_raw_request(dev, buf[0], buf, count, report_type, HID_REQ_SET_REPORT); out_free: kfree(buf); out: return ret; } static ssize_t hidraw_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { ssize_t ret; down_read(&minors_rwsem); ret = hidraw_send_report(file, buffer, count, HID_OUTPUT_REPORT); up_read(&minors_rwsem); return ret; } /* * This function performs a Get_Report transfer over the control endpoint * per section 7.2.1 of the HID specification, version 1.1. The first byte * of buffer is the report number to request, or 0x0 if the device does not * use numbered reports. The report_type parameter can be HID_FEATURE_REPORT * or HID_INPUT_REPORT. */ static ssize_t hidraw_get_report(struct file *file, char __user *buffer, size_t count, unsigned char report_type) { unsigned int minor = iminor(file_inode(file)); struct hid_device *dev; __u8 *buf; int ret = 0, len; unsigned char report_number; lockdep_assert_held(&minors_rwsem); if (!hidraw_table[minor] || !hidraw_table[minor]->exist) { ret = -ENODEV; goto out; } dev = hidraw_table[minor]->hid; if (!dev->ll_driver->raw_request) { ret = -ENODEV; goto out; } if (count > HID_MAX_BUFFER_SIZE) { hid_warn(dev, "pid %d passed too large report\n", task_pid_nr(current)); ret = -EINVAL; goto out; } if (count < 2) { hid_warn(dev, "pid %d passed too short report\n", task_pid_nr(current)); ret = -EINVAL; goto out; } buf = kmalloc(count, GFP_KERNEL); if (!buf) { ret = -ENOMEM; goto out; } /* * Read the first byte from the user. This is the report number, * which is passed to hid_hw_raw_request(). */ if (copy_from_user(&report_number, buffer, 1)) { ret = -EFAULT; goto out_free; } ret = hid_hw_raw_request(dev, report_number, buf, count, report_type, HID_REQ_GET_REPORT); if (ret < 0) goto out_free; len = (ret < count) ? ret : count; if (copy_to_user(buffer, buf, len)) { ret = -EFAULT; goto out_free; } ret = len; out_free: kfree(buf); out: return ret; } static __poll_t hidraw_poll(struct file *file, poll_table *wait) { struct hidraw_list *list = file->private_data; __poll_t mask = EPOLLOUT | EPOLLWRNORM; /* hidraw is always writable */ poll_wait(file, &list->hidraw->wait, wait); if (list->head != list->tail) mask |= EPOLLIN | EPOLLRDNORM; if (!list->hidraw->exist) mask |= EPOLLERR | EPOLLHUP; return mask; } static int hidraw_open(struct inode *inode, struct file *file) { unsigned int minor = iminor(inode); struct hidraw *dev; struct hidraw_list *list; unsigned long flags; int err = 0; if (!(list = kzalloc(sizeof(struct hidraw_list), GFP_KERNEL))) { err = -ENOMEM; goto out; } /* * Technically not writing to the hidraw_table but a write lock is * required to protect the device refcount. This is symmetrical to * hidraw_release(). */ down_write(&minors_rwsem); if (!hidraw_table[minor] || !hidraw_table[minor]->exist) { err = -ENODEV; goto out_unlock; } dev = hidraw_table[minor]; if (!dev->open++) { err = hid_hw_power(dev->hid, PM_HINT_FULLON); if (err < 0) { dev->open--; goto out_unlock; } err = hid_hw_open(dev->hid); if (err < 0) { hid_hw_power(dev->hid, PM_HINT_NORMAL); dev->open--; goto out_unlock; } } list->hidraw = hidraw_table[minor]; mutex_init(&list->read_mutex); spin_lock_irqsave(&hidraw_table[minor]->list_lock, flags); list_add_tail(&list->node, &hidraw_table[minor]->list); spin_unlock_irqrestore(&hidraw_table[minor]->list_lock, flags); file->private_data = list; out_unlock: up_write(&minors_rwsem); out: if (err < 0) kfree(list); return err; } static int hidraw_fasync(int fd, struct file *file, int on) { struct hidraw_list *list = file->private_data; return fasync_helper(fd, file, on, &list->fasync); } static void drop_ref(struct hidraw *hidraw, int exists_bit) { if (exists_bit) { hidraw->exist = 0; if (hidraw->open) { hid_hw_close(hidraw->hid); wake_up_interruptible(&hidraw->wait); } device_destroy(hidraw_class, MKDEV(hidraw_major, hidraw->minor)); } else { --hidraw->open; } if (!hidraw->open) { if (!hidraw->exist) { hidraw_table[hidraw->minor] = NULL; kfree(hidraw); } else { /* close device for last reader */ hid_hw_close(hidraw->hid); hid_hw_power(hidraw->hid, PM_HINT_NORMAL); } } } static int hidraw_release(struct inode * inode, struct file * file) { unsigned int minor = iminor(inode); struct hidraw_list *list = file->private_data; unsigned long flags; down_write(&minors_rwsem); spin_lock_irqsave(&hidraw_table[minor]->list_lock, flags); while (list->tail != list->head) { kfree(list->buffer[list->tail].value); list->buffer[list->tail].value = NULL; list->tail = (list->tail + 1) & (HIDRAW_BUFFER_SIZE - 1); } list_del(&list->node); spin_unlock_irqrestore(&hidraw_table[minor]->list_lock, flags); kfree(list); drop_ref(hidraw_table[minor], 0); up_write(&minors_rwsem); return 0; } static long hidraw_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(file); unsigned int minor = iminor(inode); long ret = 0; struct hidraw *dev; void __user *user_arg = (void __user*) arg; down_read(&minors_rwsem); dev = hidraw_table[minor]; if (!dev || !dev->exist) { ret = -ENODEV; goto out; } switch (cmd) { case HIDIOCGRDESCSIZE: if (put_user(dev->hid->rsize, (int __user *)arg)) ret = -EFAULT; break; case HIDIOCGRDESC: { __u32 len; if (get_user(len, (int __user *)arg)) ret = -EFAULT; else if (len > HID_MAX_DESCRIPTOR_SIZE - 1) ret = -EINVAL; else if (copy_to_user(user_arg + offsetof( struct hidraw_report_descriptor, value[0]), dev->hid->rdesc, min(dev->hid->rsize, len))) ret = -EFAULT; break; } case HIDIOCGRAWINFO: { struct hidraw_devinfo dinfo; dinfo.bustype = dev->hid->bus; dinfo.vendor = dev->hid->vendor; dinfo.product = dev->hid->product; if (copy_to_user(user_arg, &dinfo, sizeof(dinfo))) ret = -EFAULT; break; } default: { struct hid_device *hid = dev->hid; if (_IOC_TYPE(cmd) != 'H') { ret = -EINVAL; break; } if (_IOC_NR(cmd) == _IOC_NR(HIDIOCSFEATURE(0))) { int len = _IOC_SIZE(cmd); ret = hidraw_send_report(file, user_arg, len, HID_FEATURE_REPORT); break; } if (_IOC_NR(cmd) == _IOC_NR(HIDIOCGFEATURE(0))) { int len = _IOC_SIZE(cmd); ret = hidraw_get_report(file, user_arg, len, HID_FEATURE_REPORT); break; } if (_IOC_NR(cmd) == _IOC_NR(HIDIOCSINPUT(0))) { int len = _IOC_SIZE(cmd); ret = hidraw_send_report(file, user_arg, len, HID_INPUT_REPORT); break; } if (_IOC_NR(cmd) == _IOC_NR(HIDIOCGINPUT(0))) { int len = _IOC_SIZE(cmd); ret = hidraw_get_report(file, user_arg, len, HID_INPUT_REPORT); break; } if (_IOC_NR(cmd) == _IOC_NR(HIDIOCSOUTPUT(0))) { int len = _IOC_SIZE(cmd); ret = hidraw_send_report(file, user_arg, len, HID_OUTPUT_REPORT); break; } if (_IOC_NR(cmd) == _IOC_NR(HIDIOCGOUTPUT(0))) { int len = _IOC_SIZE(cmd); ret = hidraw_get_report(file, user_arg, len, HID_OUTPUT_REPORT); break; } /* Begin Read-only ioctls. */ if (_IOC_DIR(cmd) != _IOC_READ) { ret = -EINVAL; break; } if (_IOC_NR(cmd) == _IOC_NR(HIDIOCGRAWNAME(0))) { int len = strlen(hid->name) + 1; if (len > _IOC_SIZE(cmd)) len = _IOC_SIZE(cmd); ret = copy_to_user(user_arg, hid->name, len) ? -EFAULT : len; break; } if (_IOC_NR(cmd) == _IOC_NR(HIDIOCGRAWPHYS(0))) { int len = strlen(hid->phys) + 1; if (len > _IOC_SIZE(cmd)) len = _IOC_SIZE(cmd); ret = copy_to_user(user_arg, hid->phys, len) ? -EFAULT : len; break; } if (_IOC_NR(cmd) == _IOC_NR(HIDIOCGRAWUNIQ(0))) { int len = strlen(hid->uniq) + 1; if (len > _IOC_SIZE(cmd)) len = _IOC_SIZE(cmd); ret = copy_to_user(user_arg, hid->uniq, len) ? -EFAULT : len; break; } } ret = -ENOTTY; } out: up_read(&minors_rwsem); return ret; } static const struct file_operations hidraw_ops = { .owner = THIS_MODULE, .read = hidraw_read, .write = hidraw_write, .poll = hidraw_poll, .open = hidraw_open, .release = hidraw_release, .unlocked_ioctl = hidraw_ioctl, .fasync = hidraw_fasync, .compat_ioctl = compat_ptr_ioctl, .llseek = noop_llseek, }; int hidraw_report_event(struct hid_device *hid, u8 *data, int len) { struct hidraw *dev = hid->hidraw; struct hidraw_list *list; int ret = 0; unsigned long flags; spin_lock_irqsave(&dev->list_lock, flags); list_for_each_entry(list, &dev->list, node) { int new_head = (list->head + 1) & (HIDRAW_BUFFER_SIZE - 1); if (new_head == list->tail) continue; if (!(list->buffer[list->head].value = kmemdup(data, len, GFP_ATOMIC))) { ret = -ENOMEM; break; } list->buffer[list->head].len = len; list->head = new_head; kill_fasync(&list->fasync, SIGIO, POLL_IN); } spin_unlock_irqrestore(&dev->list_lock, flags); wake_up_interruptible(&dev->wait); return ret; } EXPORT_SYMBOL_GPL(hidraw_report_event); int hidraw_connect(struct hid_device *hid) { int minor, result; struct hidraw *dev; /* we accept any HID device, all applications */ dev = kzalloc(sizeof(struct hidraw), GFP_KERNEL); if (!dev) return -ENOMEM; result = -EINVAL; down_write(&minors_rwsem); for (minor = 0; minor < HIDRAW_MAX_DEVICES; minor++) { if (hidraw_table[minor]) continue; hidraw_table[minor] = dev; result = 0; break; } if (result) { up_write(&minors_rwsem); kfree(dev); goto out; } dev->dev = device_create(hidraw_class, &hid->dev, MKDEV(hidraw_major, minor), NULL, "%s%d", "hidraw", minor); if (IS_ERR(dev->dev)) { hidraw_table[minor] = NULL; up_write(&minors_rwsem); result = PTR_ERR(dev->dev); kfree(dev); goto out; } init_waitqueue_head(&dev->wait); spin_lock_init(&dev->list_lock); INIT_LIST_HEAD(&dev->list); dev->hid = hid; dev->minor = minor; dev->exist = 1; hid->hidraw = dev; up_write(&minors_rwsem); out: return result; } EXPORT_SYMBOL_GPL(hidraw_connect); void hidraw_disconnect(struct hid_device *hid) { struct hidraw *hidraw = hid->hidraw; down_write(&minors_rwsem); drop_ref(hidraw, 1); up_write(&minors_rwsem); } EXPORT_SYMBOL_GPL(hidraw_disconnect); int __init hidraw_init(void) { int result; dev_t dev_id; result = alloc_chrdev_region(&dev_id, HIDRAW_FIRST_MINOR, HIDRAW_MAX_DEVICES, "hidraw"); if (result < 0) { pr_warn("can't get major number\n"); goto out; } hidraw_major = MAJOR(dev_id); hidraw_class = class_create(THIS_MODULE, "hidraw"); if (IS_ERR(hidraw_class)) { result = PTR_ERR(hidraw_class); goto error_cdev; } cdev_init(&hidraw_cdev, &hidraw_ops); result = cdev_add(&hidraw_cdev, dev_id, HIDRAW_MAX_DEVICES); if (result < 0) goto error_class; pr_info("raw HID events driver (C) Jiri Kosina\n"); out: return result; error_class: class_destroy(hidraw_class); error_cdev: unregister_chrdev_region(dev_id, HIDRAW_MAX_DEVICES); goto out; } void hidraw_exit(void) { dev_t dev_id = MKDEV(hidraw_major, 0); cdev_del(&hidraw_cdev); class_destroy(hidraw_class); unregister_chrdev_region(dev_id, HIDRAW_MAX_DEVICES); } |
2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 | /* SPDX-License-Identifier: GPL-2.0 */ /* * DES & Triple DES EDE key verification helpers */ #ifndef __CRYPTO_INTERNAL_DES_H #define __CRYPTO_INTERNAL_DES_H #include <linux/crypto.h> #include <linux/fips.h> #include <crypto/des.h> #include <crypto/aead.h> #include <crypto/skcipher.h> /** * crypto_des_verify_key - Check whether a DES key is weak * @tfm: the crypto algo * @key: the key buffer * * Returns -EINVAL if the key is weak and the crypto TFM does not permit weak * keys. Otherwise, 0 is returned. * * It is the job of the caller to ensure that the size of the key equals * DES_KEY_SIZE. */ static inline int crypto_des_verify_key(struct crypto_tfm *tfm, const u8 *key) { struct des_ctx tmp; int err; err = des_expand_key(&tmp, key, DES_KEY_SIZE); if (err == -ENOKEY) { if (crypto_tfm_get_flags(tfm) & CRYPTO_TFM_REQ_FORBID_WEAK_KEYS) err = -EINVAL; else err = 0; } memzero_explicit(&tmp, sizeof(tmp)); return err; } /* * RFC2451: * * For DES-EDE3, there is no known need to reject weak or * complementation keys. Any weakness is obviated by the use of * multiple keys. * * However, if the first two or last two independent 64-bit keys are * equal (k1 == k2 or k2 == k3), then the DES3 operation is simply the * same as DES. Implementers MUST reject keys that exhibit this * property. * */ static inline int des3_ede_verify_key(const u8 *key, unsigned int key_len, bool check_weak) { int ret = fips_enabled ? -EINVAL : -ENOKEY; u32 K[6]; memcpy(K, key, DES3_EDE_KEY_SIZE); if ((!((K[0] ^ K[2]) | (K[1] ^ K[3])) || !((K[2] ^ K[4]) | (K[3] ^ K[5]))) && (fips_enabled || check_weak)) goto bad; if ((!((K[0] ^ K[4]) | (K[1] ^ K[5]))) && fips_enabled) goto bad; ret = 0; bad: memzero_explicit(K, DES3_EDE_KEY_SIZE); return ret; } /** * crypto_des3_ede_verify_key - Check whether a DES3-EDE key is weak * @tfm: the crypto algo * @key: the key buffer * * Returns -EINVAL if the key is weak and the crypto TFM does not permit weak * keys or when running in FIPS mode. Otherwise, 0 is returned. Note that some * keys are rejected in FIPS mode even if weak keys are permitted by the TFM * flags. * * It is the job of the caller to ensure that the size of the key equals * DES3_EDE_KEY_SIZE. */ static inline int crypto_des3_ede_verify_key(struct crypto_tfm *tfm, const u8 *key) { return des3_ede_verify_key(key, DES3_EDE_KEY_SIZE, crypto_tfm_get_flags(tfm) & CRYPTO_TFM_REQ_FORBID_WEAK_KEYS); } static inline int verify_skcipher_des_key(struct crypto_skcipher *tfm, const u8 *key) { return crypto_des_verify_key(crypto_skcipher_tfm(tfm), key); } static inline int verify_skcipher_des3_key(struct crypto_skcipher *tfm, const u8 *key) { return crypto_des3_ede_verify_key(crypto_skcipher_tfm(tfm), key); } static inline int verify_aead_des_key(struct crypto_aead *tfm, const u8 *key, int keylen) { if (keylen != DES_KEY_SIZE) return -EINVAL; return crypto_des_verify_key(crypto_aead_tfm(tfm), key); } static inline int verify_aead_des3_key(struct crypto_aead *tfm, const u8 *key, int keylen) { if (keylen != DES3_EDE_KEY_SIZE) return -EINVAL; return crypto_des3_ede_verify_key(crypto_aead_tfm(tfm), key); } #endif /* __CRYPTO_INTERNAL_DES_H */ |
106 111 22 106 4 1 3 1889 1888 1887 1889 720 1207 1208 24 704 1887 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 | // SPDX-License-Identifier: GPL-2.0 /* * Performance events callchain code, extracted from core.c: * * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> */ #include <linux/perf_event.h> #include <linux/slab.h> #include <linux/sched/task_stack.h> #include "internal.h" struct callchain_cpus_entries { struct rcu_head rcu_head; struct perf_callchain_entry *cpu_entries[]; }; int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH; int sysctl_perf_event_max_contexts_per_stack __read_mostly = PERF_MAX_CONTEXTS_PER_STACK; static inline size_t perf_callchain_entry__sizeof(void) { return (sizeof(struct perf_callchain_entry) + sizeof(__u64) * (sysctl_perf_event_max_stack + sysctl_perf_event_max_contexts_per_stack)); } static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]); static atomic_t nr_callchain_events; static DEFINE_MUTEX(callchain_mutex); static struct callchain_cpus_entries *callchain_cpus_entries; __weak void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { } __weak void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { } static void release_callchain_buffers_rcu(struct rcu_head *head) { struct callchain_cpus_entries *entries; int cpu; entries = container_of(head, struct callchain_cpus_entries, rcu_head); for_each_possible_cpu(cpu) kfree(entries->cpu_entries[cpu]); kfree(entries); } static void release_callchain_buffers(void) { struct callchain_cpus_entries *entries; entries = callchain_cpus_entries; RCU_INIT_POINTER(callchain_cpus_entries, NULL); call_rcu(&entries->rcu_head, release_callchain_buffers_rcu); } static int alloc_callchain_buffers(void) { int cpu; int size; struct callchain_cpus_entries *entries; /* * We can't use the percpu allocation API for data that can be * accessed from NMI. Use a temporary manual per cpu allocation * until that gets sorted out. */ size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]); entries = kzalloc(size, GFP_KERNEL); if (!entries) return -ENOMEM; size = perf_callchain_entry__sizeof() * PERF_NR_CONTEXTS; for_each_possible_cpu(cpu) { entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL, cpu_to_node(cpu)); if (!entries->cpu_entries[cpu]) goto fail; } rcu_assign_pointer(callchain_cpus_entries, entries); return 0; fail: for_each_possible_cpu(cpu) kfree(entries->cpu_entries[cpu]); kfree(entries); return -ENOMEM; } int get_callchain_buffers(int event_max_stack) { int err = 0; int count; mutex_lock(&callchain_mutex); count = atomic_inc_return(&nr_callchain_events); if (WARN_ON_ONCE(count < 1)) { err = -EINVAL; goto exit; } /* * If requesting per event more than the global cap, * return a different error to help userspace figure * this out. * * And also do it here so that we have &callchain_mutex held. */ if (event_max_stack > sysctl_perf_event_max_stack) { err = -EOVERFLOW; goto exit; } if (count == 1) err = alloc_callchain_buffers(); exit: if (err) atomic_dec(&nr_callchain_events); mutex_unlock(&callchain_mutex); return err; } void put_callchain_buffers(void) { if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) { release_callchain_buffers(); mutex_unlock(&callchain_mutex); } } struct perf_callchain_entry *get_callchain_entry(int *rctx) { int cpu; struct callchain_cpus_entries *entries; *rctx = get_recursion_context(this_cpu_ptr(callchain_recursion)); if (*rctx == -1) return NULL; entries = rcu_dereference(callchain_cpus_entries); if (!entries) { put_recursion_context(this_cpu_ptr(callchain_recursion), *rctx); return NULL; } cpu = smp_processor_id(); return (((void *)entries->cpu_entries[cpu]) + (*rctx * perf_callchain_entry__sizeof())); } void put_callchain_entry(int rctx) { put_recursion_context(this_cpu_ptr(callchain_recursion), rctx); } struct perf_callchain_entry * get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, u32 max_stack, bool crosstask, bool add_mark) { struct perf_callchain_entry *entry; struct perf_callchain_entry_ctx ctx; int rctx; entry = get_callchain_entry(&rctx); if (!entry) return NULL; ctx.entry = entry; ctx.max_stack = max_stack; ctx.nr = entry->nr = init_nr; ctx.contexts = 0; ctx.contexts_maxed = false; if (kernel && !user_mode(regs)) { if (add_mark) perf_callchain_store_context(&ctx, PERF_CONTEXT_KERNEL); perf_callchain_kernel(&ctx, regs); } if (user) { if (!user_mode(regs)) { if (current->mm) regs = task_pt_regs(current); else regs = NULL; } if (regs) { if (crosstask) goto exit_put; if (add_mark) perf_callchain_store_context(&ctx, PERF_CONTEXT_USER); perf_callchain_user(&ctx, regs); } } exit_put: put_callchain_entry(rctx); return entry; } /* * Used for sysctl_perf_event_max_stack and * sysctl_perf_event_max_contexts_per_stack. */ int perf_event_max_stack_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int *value = table->data; int new_value = *value, ret; struct ctl_table new_table = *table; new_table.data = &new_value; ret = proc_dointvec_minmax(&new_table, write, buffer, lenp, ppos); if (ret || !write) return ret; mutex_lock(&callchain_mutex); if (atomic_read(&nr_callchain_events)) ret = -EBUSY; else *value = new_value; mutex_unlock(&callchain_mutex); return ret; } |
32 2 28 1 1 102 161 97 164 161 137 41 29 7 22 19 19 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 | // SPDX-License-Identifier: GPL-2.0-or-later /* */ #include <linux/init.h> #include <linux/slab.h> #include <linux/usb.h> #include "usbaudio.h" #include "helper.h" #include "quirks.h" /* * combine bytes and get an integer value */ unsigned int snd_usb_combine_bytes(unsigned char *bytes, int size) { switch (size) { case 1: return *bytes; case 2: return combine_word(bytes); case 3: return combine_triple(bytes); case 4: return combine_quad(bytes); default: return 0; } } /* * parse descriptor buffer and return the pointer starting the given * descriptor type. */ void *snd_usb_find_desc(void *descstart, int desclen, void *after, u8 dtype) { u8 *p, *end, *next; p = descstart; end = p + desclen; for (; p < end;) { if (p[0] < 2) return NULL; next = p + p[0]; if (next > end) return NULL; if (p[1] == dtype && (!after || (void *)p > after)) { return p; } p = next; } return NULL; } /* * find a class-specified interface descriptor with the given subtype. */ void *snd_usb_find_csint_desc(void *buffer, int buflen, void *after, u8 dsubtype) { unsigned char *p = after; while ((p = snd_usb_find_desc(buffer, buflen, p, USB_DT_CS_INTERFACE)) != NULL) { if (p[0] >= 3 && p[2] == dsubtype) return p; } return NULL; } /* * Wrapper for usb_control_msg(). * Allocates a temp buffer to prevent dmaing from/to the stack. */ int snd_usb_ctl_msg(struct usb_device *dev, unsigned int pipe, __u8 request, __u8 requesttype, __u16 value, __u16 index, void *data, __u16 size) { int err; void *buf = NULL; int timeout; if (usb_pipe_type_check(dev, pipe)) return -EINVAL; if (size > 0) { buf = kmemdup(data, size, GFP_KERNEL); if (!buf) return -ENOMEM; } if (requesttype & USB_DIR_IN) timeout = USB_CTRL_GET_TIMEOUT; else timeout = USB_CTRL_SET_TIMEOUT; err = usb_control_msg(dev, pipe, request, requesttype, value, index, buf, size, timeout); if (size > 0) { memcpy(data, buf, size); kfree(buf); } snd_usb_ctl_msg_quirk(dev, pipe, request, requesttype, value, index, data, size); return err; } unsigned char snd_usb_parse_datainterval(struct snd_usb_audio *chip, struct usb_host_interface *alts) { switch (snd_usb_get_speed(chip->dev)) { case USB_SPEED_HIGH: case USB_SPEED_WIRELESS: case USB_SPEED_SUPER: case USB_SPEED_SUPER_PLUS: if (get_endpoint(alts, 0)->bInterval >= 1 && get_endpoint(alts, 0)->bInterval <= 4) return get_endpoint(alts, 0)->bInterval - 1; break; default: break; } return 0; } struct usb_host_interface * snd_usb_get_host_interface(struct snd_usb_audio *chip, int ifnum, int altsetting) { struct usb_interface *iface; iface = usb_ifnum_to_if(chip->dev, ifnum); if (!iface) return NULL; return usb_altnum_to_altsetting(iface, altsetting); } |
127 29 42 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 | // SPDX-License-Identifier: GPL-2.0 /* * fs/ext4/extents_status.h * * Written by Yongqiang Yang <xiaoqiangnk@gmail.com> * Modified by * Allison Henderson <achender@linux.vnet.ibm.com> * Zheng Liu <wenqing.lz@taobao.com> * */ #ifndef _EXT4_EXTENTS_STATUS_H #define _EXT4_EXTENTS_STATUS_H /* * Turn on ES_DEBUG__ to get lots of info about extent status operations. */ #ifdef ES_DEBUG__ #define es_debug(fmt, ...) printk(fmt, ##__VA_ARGS__) #else #define es_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif /* * With ES_AGGRESSIVE_TEST defined, the result of es caching will be * checked with old map_block's result. */ #define ES_AGGRESSIVE_TEST__ /* * These flags live in the high bits of extent_status.es_pblk */ enum { ES_WRITTEN_B, ES_UNWRITTEN_B, ES_DELAYED_B, ES_HOLE_B, ES_REFERENCED_B, ES_FLAGS }; #define ES_SHIFT (sizeof(ext4_fsblk_t)*8 - ES_FLAGS) #define ES_MASK (~((ext4_fsblk_t)0) << ES_SHIFT) #define EXTENT_STATUS_WRITTEN (1 << ES_WRITTEN_B) #define EXTENT_STATUS_UNWRITTEN (1 << ES_UNWRITTEN_B) #define EXTENT_STATUS_DELAYED (1 << ES_DELAYED_B) #define EXTENT_STATUS_HOLE (1 << ES_HOLE_B) #define EXTENT_STATUS_REFERENCED (1 << ES_REFERENCED_B) #define ES_TYPE_MASK ((ext4_fsblk_t)(EXTENT_STATUS_WRITTEN | \ EXTENT_STATUS_UNWRITTEN | \ EXTENT_STATUS_DELAYED | \ EXTENT_STATUS_HOLE) << ES_SHIFT) struct ext4_sb_info; struct ext4_extent; struct extent_status { struct rb_node rb_node; ext4_lblk_t es_lblk; /* first logical block extent covers */ ext4_lblk_t es_len; /* length of extent in block */ ext4_fsblk_t es_pblk; /* first physical block */ }; struct ext4_es_tree { struct rb_root root; struct extent_status *cache_es; /* recently accessed extent */ }; struct ext4_es_stats { unsigned long es_stats_shrunk; struct percpu_counter es_stats_cache_hits; struct percpu_counter es_stats_cache_misses; u64 es_stats_scan_time; u64 es_stats_max_scan_time; struct percpu_counter es_stats_all_cnt; struct percpu_counter es_stats_shk_cnt; }; /* * Pending cluster reservations for bigalloc file systems * * A cluster with a pending reservation is a logical cluster shared by at * least one extent in the extents status tree with delayed and unwritten * status and at least one other written or unwritten extent. The * reservation is said to be pending because a cluster reservation would * have to be taken in the event all blocks in the cluster shared with * written or unwritten extents were deleted while the delayed and * unwritten blocks remained. * * The set of pending cluster reservations is an auxiliary data structure * used with the extents status tree to implement reserved cluster/block * accounting for bigalloc file systems. The set is kept in memory and * records all pending cluster reservations. * * Its primary function is to avoid the need to read extents from the * disk when invalidating pages as a result of a truncate, punch hole, or * collapse range operation. Page invalidation requires a decrease in the * reserved cluster count if it results in the removal of all delayed * and unwritten extents (blocks) from a cluster that is not shared with a * written or unwritten extent, and no decrease otherwise. Determining * whether the cluster is shared can be done by searching for a pending * reservation on it. * * Secondarily, it provides a potentially faster method for determining * whether the reserved cluster count should be increased when a physical * cluster is deallocated as a result of a truncate, punch hole, or * collapse range operation. The necessary information is also present * in the extents status tree, but might be more rapidly accessed in * the pending reservation set in many cases due to smaller size. * * The pending cluster reservation set is implemented as a red-black tree * with the goal of minimizing per page search time overhead. */ struct pending_reservation { struct rb_node rb_node; ext4_lblk_t lclu; }; struct ext4_pending_tree { struct rb_root root; }; extern int __init ext4_init_es(void); extern void ext4_exit_es(void); extern void ext4_es_init_tree(struct ext4_es_tree *tree); extern void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk, unsigned int status); extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk, unsigned int status); extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len); extern void ext4_es_find_extent_range(struct inode *inode, int (*match_fn)(struct extent_status *es), ext4_lblk_t lblk, ext4_lblk_t end, struct extent_status *es); extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t *next_lblk, struct extent_status *es); extern bool ext4_es_scan_range(struct inode *inode, int (*matching_fn)(struct extent_status *es), ext4_lblk_t lblk, ext4_lblk_t end); extern bool ext4_es_scan_clu(struct inode *inode, int (*matching_fn)(struct extent_status *es), ext4_lblk_t lblk); static inline unsigned int ext4_es_status(struct extent_status *es) { return es->es_pblk >> ES_SHIFT; } static inline unsigned int ext4_es_type(struct extent_status *es) { return (es->es_pblk & ES_TYPE_MASK) >> ES_SHIFT; } static inline int ext4_es_is_written(struct extent_status *es) { return (ext4_es_type(es) & EXTENT_STATUS_WRITTEN) != 0; } static inline int ext4_es_is_unwritten(struct extent_status *es) { return (ext4_es_type(es) & EXTENT_STATUS_UNWRITTEN) != 0; } static inline int ext4_es_is_delayed(struct extent_status *es) { return (ext4_es_type(es) & EXTENT_STATUS_DELAYED) != 0; } static inline int ext4_es_is_hole(struct extent_status *es) { return (ext4_es_type(es) & EXTENT_STATUS_HOLE) != 0; } static inline int ext4_es_is_mapped(struct extent_status *es) { return (ext4_es_is_written(es) || ext4_es_is_unwritten(es)); } static inline int ext4_es_is_delonly(struct extent_status *es) { return (ext4_es_is_delayed(es) && !ext4_es_is_unwritten(es)); } static inline void ext4_es_set_referenced(struct extent_status *es) { es->es_pblk |= ((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT; } static inline void ext4_es_clear_referenced(struct extent_status *es) { es->es_pblk &= ~(((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT); } static inline int ext4_es_is_referenced(struct extent_status *es) { return (ext4_es_status(es) & EXTENT_STATUS_REFERENCED) != 0; } static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es) { return es->es_pblk & ~ES_MASK; } static inline ext4_fsblk_t ext4_es_show_pblock(struct extent_status *es) { ext4_fsblk_t pblock = ext4_es_pblock(es); return pblock == ~ES_MASK ? 0 : pblock; } static inline void ext4_es_store_pblock(struct extent_status *es, ext4_fsblk_t pb) { ext4_fsblk_t block; block = (pb & ~ES_MASK) | (es->es_pblk & ES_MASK); es->es_pblk = block; } static inline void ext4_es_store_status(struct extent_status *es, unsigned int status) { es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) | (es->es_pblk & ~ES_MASK); } static inline void ext4_es_store_pblock_status(struct extent_status *es, ext4_fsblk_t pb, unsigned int status) { es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) | (pb & ~ES_MASK); } extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi); extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi); extern int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v); extern int __init ext4_init_pending(void); extern void ext4_exit_pending(void); extern void ext4_init_pending_tree(struct ext4_pending_tree *tree); extern void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk); extern bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk); extern int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk, bool allocated); extern unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len); extern void ext4_clear_inode_es(struct inode *inode); #endif /* _EXT4_EXTENTS_STATUS_H */ |
304 303 27 31 3 160 261 262 263 157 136 58 4 49 15 16 50 114 15 16 36 1 105 114 28 107 98 112 28 25 98 27 27 27 11 26 27 27 27 25 25 1 97 97 15 97 15 97 262 262 3 124 164 75 75 62 55 112 2 112 2 114 114 113 114 25 97 202 21 260 43 261 238 81 81 31 64 64 64 50 17 26 64 22 81 40 44 16 76 76 44 44 115 122 81 57 29 57 72 43 83 83 43 49 36 1 21 21 17 14 44 137 137 137 137 93 19 8 16 41 25 41 83 10 89 20 94 18 25 25 4 8 8 1 9 2 16 2 3 3 5 5 1 4 3 1 4 21 5 21 15 8 5 3 4 3 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/indirect.c * * from * * linux/fs/ext4/inode.c * * Copyright (C) 1992, 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) * * from * * linux/fs/minix/inode.c * * Copyright (C) 1991, 1992 Linus Torvalds * * Goal-directed block allocation by Stephen Tweedie * (sct@redhat.com), 1993, 1998 */ #include "ext4_jbd2.h" #include "truncate.h" #include <linux/dax.h> #include <linux/uio.h> #include <trace/events/ext4.h> typedef struct { __le32 *p; __le32 key; struct buffer_head *bh; } Indirect; static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v) { p->key = *(p->p = v); p->bh = bh; } /** * ext4_block_to_path - parse the block number into array of offsets * @inode: inode in question (we are only interested in its superblock) * @i_block: block number to be parsed * @offsets: array to store the offsets in * @boundary: set this non-zero if the referred-to block is likely to be * followed (on disk) by an indirect block. * * To store the locations of file's data ext4 uses a data structure common * for UNIX filesystems - tree of pointers anchored in the inode, with * data blocks at leaves and indirect blocks in intermediate nodes. * This function translates the block number into path in that tree - * return value is the path length and @offsets[n] is the offset of * pointer to (n+1)th node in the nth one. If @block is out of range * (negative or too large) warning is printed and zero returned. * * Note: function doesn't find node addresses, so no IO is needed. All * we need to know is the capacity of indirect blocks (taken from the * inode->i_sb). */ /* * Portability note: the last comparison (check that we fit into triple * indirect block) is spelled differently, because otherwise on an * architecture with 32-bit longs and 8Kb pages we might get into trouble * if our filesystem had 8Kb blocks. We might use long long, but that would * kill us on x86. Oh, well, at least the sign propagation does not matter - * i_block would have to be negative in the very beginning, so we would not * get there at all. */ static int ext4_block_to_path(struct inode *inode, ext4_lblk_t i_block, ext4_lblk_t offsets[4], int *boundary) { int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb); int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb); const long direct_blocks = EXT4_NDIR_BLOCKS, indirect_blocks = ptrs, double_blocks = (1 << (ptrs_bits * 2)); int n = 0; int final = 0; if (i_block < direct_blocks) { offsets[n++] = i_block; final = direct_blocks; } else if ((i_block -= direct_blocks) < indirect_blocks) { offsets[n++] = EXT4_IND_BLOCK; offsets[n++] = i_block; final = ptrs; } else if ((i_block -= indirect_blocks) < double_blocks) { offsets[n++] = EXT4_DIND_BLOCK; offsets[n++] = i_block >> ptrs_bits; offsets[n++] = i_block & (ptrs - 1); final = ptrs; } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { offsets[n++] = EXT4_TIND_BLOCK; offsets[n++] = i_block >> (ptrs_bits * 2); offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1); offsets[n++] = i_block & (ptrs - 1); final = ptrs; } else { ext4_warning(inode->i_sb, "block %lu > max in inode %lu", i_block + direct_blocks + indirect_blocks + double_blocks, inode->i_ino); } if (boundary) *boundary = final - 1 - (i_block & (ptrs - 1)); return n; } /** * ext4_get_branch - read the chain of indirect blocks leading to data * @inode: inode in question * @depth: depth of the chain (1 - direct pointer, etc.) * @offsets: offsets of pointers in inode/indirect blocks * @chain: place to store the result * @err: here we store the error value * * Function fills the array of triples <key, p, bh> and returns %NULL * if everything went OK or the pointer to the last filled triple * (incomplete one) otherwise. Upon the return chain[i].key contains * the number of (i+1)-th block in the chain (as it is stored in memory, * i.e. little-endian 32-bit), chain[i].p contains the address of that * number (it points into struct inode for i==0 and into the bh->b_data * for i>0) and chain[i].bh points to the buffer_head of i-th indirect * block for i>0 and NULL for i==0. In other words, it holds the block * numbers of the chain, addresses they were taken from (and where we can * verify that chain did not change) and buffer_heads hosting these * numbers. * * Function stops when it stumbles upon zero pointer (absent block) * (pointer to last triple returned, *@err == 0) * or when it gets an IO error reading an indirect block * (ditto, *@err == -EIO) * or when it reads all @depth-1 indirect blocks successfully and finds * the whole chain, all way to the data (returns %NULL, *err == 0). * * Need to be called with * down_read(&EXT4_I(inode)->i_data_sem) */ static Indirect *ext4_get_branch(struct inode *inode, int depth, ext4_lblk_t *offsets, Indirect chain[4], int *err) { struct super_block *sb = inode->i_sb; Indirect *p = chain; struct buffer_head *bh; unsigned int key; int ret = -EIO; *err = 0; /* i_data is not going away, no lock needed */ add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets); if (!p->key) goto no_block; while (--depth) { key = le32_to_cpu(p->key); if (key > ext4_blocks_count(EXT4_SB(sb)->s_es)) { /* the block was out of range */ ret = -EFSCORRUPTED; goto failure; } bh = sb_getblk(sb, key); if (unlikely(!bh)) { ret = -ENOMEM; goto failure; } if (!bh_uptodate_or_lock(bh)) { if (ext4_read_bh(bh, 0, NULL) < 0) { put_bh(bh); goto failure; } /* validate block references */ if (ext4_check_indirect_blockref(inode, bh)) { put_bh(bh); goto failure; } } add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets); /* Reader: end */ if (!p->key) goto no_block; } return NULL; failure: *err = ret; no_block: return p; } /** * ext4_find_near - find a place for allocation with sufficient locality * @inode: owner * @ind: descriptor of indirect block. * * This function returns the preferred place for block allocation. * It is used when heuristic for sequential allocation fails. * Rules are: * + if there is a block to the left of our position - allocate near it. * + if pointer will live in indirect block - allocate near that block. * + if pointer will live in inode - allocate in the same * cylinder group. * * In the latter case we colour the starting block by the callers PID to * prevent it from clashing with concurrent allocations for a different inode * in the same block group. The PID is used here so that functionally related * files will be close-by on-disk. * * Caller must make sure that @ind is valid and will stay that way. */ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) { struct ext4_inode_info *ei = EXT4_I(inode); __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data; __le32 *p; /* Try to find previous block */ for (p = ind->p - 1; p >= start; p--) { if (*p) return le32_to_cpu(*p); } /* No such thing, so let's try location of indirect block */ if (ind->bh) return ind->bh->b_blocknr; /* * It is going to be referred to from the inode itself? OK, just put it * into the same cylinder group then. */ return ext4_inode_to_goal_block(inode); } /** * ext4_find_goal - find a preferred place for allocation. * @inode: owner * @block: block we want * @partial: pointer to the last triple within a chain * * Normally this function find the preferred place for block allocation, * returns it. * Because this is only used for non-extent files, we limit the block nr * to 32 bits. */ static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, Indirect *partial) { ext4_fsblk_t goal; /* * XXX need to get goal block from mballoc's data structures */ goal = ext4_find_near(inode, partial); goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; return goal; } /** * ext4_blks_to_allocate - Look up the block map and count the number * of direct blocks need to be allocated for the given branch. * * @branch: chain of indirect blocks * @k: number of blocks need for indirect blocks * @blks: number of data blocks to be mapped. * @blocks_to_boundary: the offset in the indirect block * * return the total number of blocks to be allocate, including the * direct and indirect blocks. */ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, int blocks_to_boundary) { unsigned int count = 0; /* * Simple case, [t,d]Indirect block(s) has not allocated yet * then it's clear blocks on that path have not allocated */ if (k > 0) { /* right now we don't handle cross boundary allocation */ if (blks < blocks_to_boundary + 1) count += blks; else count += blocks_to_boundary + 1; return count; } count++; while (count < blks && count <= blocks_to_boundary && le32_to_cpu(*(branch[0].p + count)) == 0) { count++; } return count; } /** * ext4_alloc_branch() - allocate and set up a chain of blocks * @handle: handle for this transaction * @ar: structure describing the allocation request * @indirect_blks: number of allocated indirect blocks * @offsets: offsets (in the blocks) to store the pointers to next. * @branch: place to store the chain in. * * This function allocates blocks, zeroes out all but the last one, * links them into chain and (if we are synchronous) writes them to disk. * In other words, it prepares a branch that can be spliced onto the * inode. It stores the information about that chain in the branch[], in * the same format as ext4_get_branch() would do. We are calling it after * we had read the existing part of chain and partial points to the last * triple of that (one with zero ->key). Upon the exit we have the same * picture as after the successful ext4_get_block(), except that in one * place chain is disconnected - *branch->p is still zero (we did not * set the last link), but branch->key contains the number that should * be placed into *branch->p to fill that gap. * * If allocation fails we free all blocks we've allocated (and forget * their buffer_heads) and return the error value the from failed * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain * as described above and return 0. */ static int ext4_alloc_branch(handle_t *handle, struct ext4_allocation_request *ar, int indirect_blks, ext4_lblk_t *offsets, Indirect *branch) { struct buffer_head * bh; ext4_fsblk_t b, new_blocks[4]; __le32 *p; int i, j, err, len = 1; for (i = 0; i <= indirect_blks; i++) { if (i == indirect_blks) { new_blocks[i] = ext4_mb_new_blocks(handle, ar, &err); } else { ar->goal = new_blocks[i] = ext4_new_meta_blocks(handle, ar->inode, ar->goal, ar->flags & EXT4_MB_DELALLOC_RESERVED, NULL, &err); /* Simplify error cleanup... */ branch[i+1].bh = NULL; } if (err) { i--; goto failed; } branch[i].key = cpu_to_le32(new_blocks[i]); if (i == 0) continue; bh = branch[i].bh = sb_getblk(ar->inode->i_sb, new_blocks[i-1]); if (unlikely(!bh)) { err = -ENOMEM; goto failed; } lock_buffer(bh); BUFFER_TRACE(bh, "call get_create_access"); err = ext4_journal_get_create_access(handle, ar->inode->i_sb, bh, EXT4_JTR_NONE); if (err) { unlock_buffer(bh); goto failed; } memset(bh->b_data, 0, bh->b_size); p = branch[i].p = (__le32 *) bh->b_data + offsets[i]; b = new_blocks[i]; if (i == indirect_blks) len = ar->len; for (j = 0; j < len; j++) *p++ = cpu_to_le32(b++); BUFFER_TRACE(bh, "marking uptodate"); set_buffer_uptodate(bh); unlock_buffer(bh); BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, ar->inode, bh); if (err) goto failed; } return 0; failed: if (i == indirect_blks) { /* Free data blocks */ ext4_free_blocks(handle, ar->inode, NULL, new_blocks[i], ar->len, 0); i--; } for (; i >= 0; i--) { /* * We want to ext4_forget() only freshly allocated indirect * blocks. Buffer for new_blocks[i] is at branch[i+1].bh * (buffer at branch[0].bh is indirect block / inode already * existing before ext4_alloc_branch() was called). Also * because blocks are freshly allocated, we don't need to * revoke them which is why we don't set * EXT4_FREE_BLOCKS_METADATA. */ ext4_free_blocks(handle, ar->inode, branch[i+1].bh, new_blocks[i], 1, branch[i+1].bh ? EXT4_FREE_BLOCKS_FORGET : 0); } return err; } /** * ext4_splice_branch() - splice the allocated branch onto inode. * @handle: handle for this transaction * @ar: structure describing the allocation request * @where: location of missing link * @num: number of indirect blocks we are adding * * This function fills the missing link and does all housekeeping needed in * inode (->i_blocks, etc.). In case of success we end up with the full * chain to new block and return 0. */ static int ext4_splice_branch(handle_t *handle, struct ext4_allocation_request *ar, Indirect *where, int num) { int i; int err = 0; ext4_fsblk_t current_block; /* * If we're splicing into a [td]indirect block (as opposed to the * inode) then we need to get write access to the [td]indirect block * before the splice. */ if (where->bh) { BUFFER_TRACE(where->bh, "get_write_access"); err = ext4_journal_get_write_access(handle, ar->inode->i_sb, where->bh, EXT4_JTR_NONE); if (err) goto err_out; } /* That's it */ *where->p = where->key; /* * Update the host buffer_head or inode to point to more just allocated * direct blocks blocks */ if (num == 0 && ar->len > 1) { current_block = le32_to_cpu(where->key) + 1; for (i = 1; i < ar->len; i++) *(where->p + i) = cpu_to_le32(current_block++); } /* We are done with atomic stuff, now do the rest of housekeeping */ /* had we spliced it onto indirect block? */ if (where->bh) { /* * If we spliced it onto an indirect block, we haven't * altered the inode. Note however that if it is being spliced * onto an indirect block at the very end of the file (the * file is growing) then we *will* alter the inode to reflect * the new i_size. But that is not done here - it is done in * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode. */ ext4_debug("splicing indirect only\n"); BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, ar->inode, where->bh); if (err) goto err_out; } else { /* * OK, we spliced it into the inode itself on a direct block. */ err = ext4_mark_inode_dirty(handle, ar->inode); if (unlikely(err)) goto err_out; ext4_debug("splicing direct\n"); } return err; err_out: for (i = 1; i <= num; i++) { /* * branch[i].bh is newly allocated, so there is no * need to revoke the block, which is why we don't * need to set EXT4_FREE_BLOCKS_METADATA. */ ext4_free_blocks(handle, ar->inode, where[i].bh, 0, 1, EXT4_FREE_BLOCKS_FORGET); } ext4_free_blocks(handle, ar->inode, NULL, le32_to_cpu(where[num].key), ar->len, 0); return err; } /* * The ext4_ind_map_blocks() function handles non-extents inodes * (i.e., using the traditional indirect/double-indirect i_blocks * scheme) for ext4_map_blocks(). * * Allocation strategy is simple: if we have to allocate something, we will * have to go the whole way to leaf. So let's do it before attaching anything * to tree, set linkage between the newborn blocks, write them if sync is * required, recheck the path, free and repeat if check fails, otherwise * set the last missing link (that will protect us from any truncate-generated * removals - all blocks on the path are immune now) and possibly force the * write on the parent block. * That has a nice additional property: no special recovery from the failed * allocations is needed - we simply release blocks and do not touch anything * reachable from inode. * * `handle' can be NULL if create == 0. * * return > 0, # of blocks mapped or allocated. * return = 0, if plain lookup failed. * return < 0, error case. * * The ext4_ind_get_blocks() function should be called with * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system * blocks. */ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags) { struct ext4_allocation_request ar; int err = -EIO; ext4_lblk_t offsets[4]; Indirect chain[4]; Indirect *partial; int indirect_blks; int blocks_to_boundary = 0; int depth; int count = 0; ext4_fsblk_t first_block = 0; trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))); ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); depth = ext4_block_to_path(inode, map->m_lblk, offsets, &blocks_to_boundary); if (depth == 0) goto out; partial = ext4_get_branch(inode, depth, offsets, chain, &err); /* Simplest case - block found, no allocation needed */ if (!partial) { first_block = le32_to_cpu(chain[depth - 1].key); count++; /*map more blocks*/ while (count < map->m_len && count <= blocks_to_boundary) { ext4_fsblk_t blk; blk = le32_to_cpu(*(chain[depth-1].p + count)); if (blk == first_block + count) count++; else break; } goto got_it; } /* Next simple case - plain lookup failed */ if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { unsigned epb = inode->i_sb->s_blocksize / sizeof(u32); int i; /* * Count number blocks in a subtree under 'partial'. At each * level we count number of complete empty subtrees beyond * current offset and then descend into the subtree only * partially beyond current offset. */ count = 0; for (i = partial - chain + 1; i < depth; i++) count = count * epb + (epb - offsets[i] - 1); count++; /* Fill in size of a hole we found */ map->m_pblk = 0; map->m_len = min_t(unsigned int, map->m_len, count); goto cleanup; } /* Failed read of indirect block */ if (err == -EIO) goto cleanup; /* * Okay, we need to do block allocation. */ if (ext4_has_feature_bigalloc(inode->i_sb)) { EXT4_ERROR_INODE(inode, "Can't allocate blocks for " "non-extent mapped inodes with bigalloc"); err = -EFSCORRUPTED; goto out; } /* Set up for the direct block allocation */ memset(&ar, 0, sizeof(ar)); ar.inode = inode; ar.logical = map->m_lblk; if (S_ISREG(inode->i_mode)) ar.flags = EXT4_MB_HINT_DATA; if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) ar.flags |= EXT4_MB_DELALLOC_RESERVED; if (flags & EXT4_GET_BLOCKS_METADATA_NOFAIL) ar.flags |= EXT4_MB_USE_RESERVED; ar.goal = ext4_find_goal(inode, map->m_lblk, partial); /* the number of blocks need to allocate for [d,t]indirect blocks */ indirect_blks = (chain + depth) - partial - 1; /* * Next look up the indirect map to count the totoal number of * direct blocks to allocate for this branch. */ ar.len = ext4_blks_to_allocate(partial, indirect_blks, map->m_len, blocks_to_boundary); /* * Block out ext4_truncate while we alter the tree */ err = ext4_alloc_branch(handle, &ar, indirect_blks, offsets + (partial - chain), partial); /* * The ext4_splice_branch call will free and forget any buffers * on the new chain if there is a failure, but that risks using * up transaction credits, especially for bitmaps where the * credits cannot be returned. Can we handle this somehow? We * may need to return -EAGAIN upwards in the worst case. --sct */ if (!err) err = ext4_splice_branch(handle, &ar, partial, indirect_blks); if (err) goto cleanup; map->m_flags |= EXT4_MAP_NEW; ext4_update_inode_fsync_trans(handle, inode, 1); count = ar.len; /* * Update reserved blocks/metadata blocks after successful block * allocation which had been deferred till now. */ if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) ext4_da_update_reserve_space(inode, count, 1); got_it: map->m_flags |= EXT4_MAP_MAPPED; map->m_pblk = le32_to_cpu(chain[depth-1].key); map->m_len = count; if (count > blocks_to_boundary) map->m_flags |= EXT4_MAP_BOUNDARY; err = count; /* Clean up and exit */ partial = chain + depth - 1; /* the whole chain */ cleanup: while (partial > chain) { BUFFER_TRACE(partial->bh, "call brelse"); brelse(partial->bh); partial--; } out: trace_ext4_ind_map_blocks_exit(inode, flags, map, err); return err; } /* * Calculate number of indirect blocks touched by mapping @nrblocks logically * contiguous blocks */ int ext4_ind_trans_blocks(struct inode *inode, int nrblocks) { /* * With N contiguous data blocks, we need at most * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks, * 2 dindirect blocks, and 1 tindirect block */ return DIV_ROUND_UP(nrblocks, EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4; } static int ext4_ind_trunc_restart_fn(handle_t *handle, struct inode *inode, struct buffer_head *bh, int *dropped) { int err; if (bh) { BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, inode, bh); if (unlikely(err)) return err; } err = ext4_mark_inode_dirty(handle, inode); if (unlikely(err)) return err; /* * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this * moment, get_block can be called only for blocks inside i_size since * page cache has been already dropped and writes are blocked by * i_rwsem. So we can safely drop the i_data_sem here. */ BUG_ON(EXT4_JOURNAL(inode) == NULL); ext4_discard_preallocations(inode, 0); up_write(&EXT4_I(inode)->i_data_sem); *dropped = 1; return 0; } /* * Truncate transactions can be complex and absolutely huge. So we need to * be able to restart the transaction at a convenient checkpoint to make * sure we don't overflow the journal. * * Try to extend this transaction for the purposes of truncation. If * extend fails, we restart transaction. */ static int ext4_ind_truncate_ensure_credits(handle_t *handle, struct inode *inode, struct buffer_head *bh, int revoke_creds) { int ret; int dropped = 0; ret = ext4_journal_ensure_credits_fn(handle, EXT4_RESERVE_TRANS_BLOCKS, ext4_blocks_for_truncate(inode), revoke_creds, ext4_ind_trunc_restart_fn(handle, inode, bh, &dropped)); if (dropped) down_write(&EXT4_I(inode)->i_data_sem); if (ret <= 0) return ret; if (bh) { BUFFER_TRACE(bh, "retaking write access"); ret = ext4_journal_get_write_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); if (unlikely(ret)) return ret; } return 0; } /* * Probably it should be a library function... search for first non-zero word * or memcmp with zero_page, whatever is better for particular architecture. * Linus? */ static inline int all_zeroes(__le32 *p, __le32 *q) { while (p < q) if (*p++) return 0; return 1; } /** * ext4_find_shared - find the indirect blocks for partial truncation. * @inode: inode in question * @depth: depth of the affected branch * @offsets: offsets of pointers in that branch (see ext4_block_to_path) * @chain: place to store the pointers to partial indirect blocks * @top: place to the (detached) top of branch * * This is a helper function used by ext4_truncate(). * * When we do truncate() we may have to clean the ends of several * indirect blocks but leave the blocks themselves alive. Block is * partially truncated if some data below the new i_size is referred * from it (and it is on the path to the first completely truncated * data block, indeed). We have to free the top of that path along * with everything to the right of the path. Since no allocation * past the truncation point is possible until ext4_truncate() * finishes, we may safely do the latter, but top of branch may * require special attention - pageout below the truncation point * might try to populate it. * * We atomically detach the top of branch from the tree, store the * block number of its root in *@top, pointers to buffer_heads of * partially truncated blocks - in @chain[].bh and pointers to * their last elements that should not be removed - in * @chain[].p. Return value is the pointer to last filled element * of @chain. * * The work left to caller to do the actual freeing of subtrees: * a) free the subtree starting from *@top * b) free the subtrees whose roots are stored in * (@chain[i].p+1 .. end of @chain[i].bh->b_data) * c) free the subtrees growing from the inode past the @chain[0]. * (no partially truncated stuff there). */ static Indirect *ext4_find_shared(struct inode *inode, int depth, ext4_lblk_t offsets[4], Indirect chain[4], __le32 *top) { Indirect *partial, *p; int k, err; *top = 0; /* Make k index the deepest non-null offset + 1 */ for (k = depth; k > 1 && !offsets[k-1]; k--) ; partial = ext4_get_branch(inode, k, offsets, chain, &err); /* Writer: pointers */ if (!partial) partial = chain + k-1; /* * If the branch acquired continuation since we've looked at it - * fine, it should all survive and (new) top doesn't belong to us. */ if (!partial->key && *partial->p) /* Writer: end */ goto no_top; for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--) ; /* * OK, we've found the last block that must survive. The rest of our * branch should be detached before unlocking. However, if that rest * of branch is all ours and does not grow immediately from the inode * it's easier to cheat and just decrement partial->p. */ if (p == chain + k - 1 && p > chain) { p->p--; } else { *top = *p->p; /* Nope, don't do this in ext4. Must leave the tree intact */ #if 0 *p->p = 0; #endif } /* Writer: end */ while (partial > p) { brelse(partial->bh); partial--; } no_top: return partial; } /* * Zero a number of block pointers in either an inode or an indirect block. * If we restart the transaction we must again get write access to the * indirect block for further modification. * * We release `count' blocks on disk, but (last - first) may be greater * than `count' because there can be holes in there. * * Return 0 on success, 1 on invalid block range * and < 0 on fatal error. */ static int ext4_clear_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh, ext4_fsblk_t block_to_free, unsigned long count, __le32 *first, __le32 *last) { __le32 *p; int flags = EXT4_FREE_BLOCKS_VALIDATED; int err; if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) || ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE)) flags |= EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_METADATA; else if (ext4_should_journal_data(inode)) flags |= EXT4_FREE_BLOCKS_FORGET; if (!ext4_inode_block_valid(inode, block_to_free, count)) { EXT4_ERROR_INODE(inode, "attempt to clear invalid " "blocks %llu len %lu", (unsigned long long) block_to_free, count); return 1; } err = ext4_ind_truncate_ensure_credits(handle, inode, bh, ext4_free_data_revoke_credits(inode, count)); if (err < 0) goto out_err; for (p = first; p < last; p++) *p = 0; ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags); return 0; out_err: ext4_std_error(inode->i_sb, err); return err; } /** * ext4_free_data - free a list of data blocks * @handle: handle for this transaction * @inode: inode we are dealing with * @this_bh: indirect buffer_head which contains *@first and *@last * @first: array of block numbers * @last: points immediately past the end of array * * We are freeing all blocks referred from that array (numbers are stored as * little-endian 32-bit) and updating @inode->i_blocks appropriately. * * We accumulate contiguous runs of blocks to free. Conveniently, if these * blocks are contiguous then releasing them at one time will only affect one * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't * actually use a lot of journal space. * * @this_bh will be %NULL if @first and @last point into the inode's direct * block pointers. */ static void ext4_free_data(handle_t *handle, struct inode *inode, struct buffer_head *this_bh, __le32 *first, __le32 *last) { ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */ unsigned long count = 0; /* Number of blocks in the run */ __le32 *block_to_free_p = NULL; /* Pointer into inode/ind corresponding to block_to_free */ ext4_fsblk_t nr; /* Current block # */ __le32 *p; /* Pointer into inode/ind for current block */ int err = 0; if (this_bh) { /* For indirect block */ BUFFER_TRACE(this_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, inode->i_sb, this_bh, EXT4_JTR_NONE); /* Important: if we can't update the indirect pointers * to the blocks, we can't free them. */ if (err) return; } for (p = first; p < last; p++) { nr = le32_to_cpu(*p); if (nr) { /* accumulate blocks to free if they're contiguous */ if (count == 0) { block_to_free = nr; block_to_free_p = p; count = 1; } else if (nr == block_to_free + count) { count++; } else { err = ext4_clear_blocks(handle, inode, this_bh, block_to_free, count, block_to_free_p, p); if (err) break; block_to_free = nr; block_to_free_p = p; count = 1; } } } if (!err && count > 0) err = ext4_clear_blocks(handle, inode, this_bh, block_to_free, count, block_to_free_p, p); if (err < 0) /* fatal error */ return; if (this_bh) { BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata"); /* * The buffer head should have an attached journal head at this * point. However, if the data is corrupted and an indirect * block pointed to itself, it would have been detached when * the block was cleared. Check for this instead of OOPSing. */ if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) ext4_handle_dirty_metadata(handle, inode, this_bh); else EXT4_ERROR_INODE(inode, "circular indirect block detected at " "block %llu", (unsigned long long) this_bh->b_blocknr); } } /** * ext4_free_branches - free an array of branches * @handle: JBD handle for this transaction * @inode: inode we are dealing with * @parent_bh: the buffer_head which contains *@first and *@last * @first: array of block numbers * @last: pointer immediately past the end of array * @depth: depth of the branches to free * * We are freeing all blocks referred from these branches (numbers are * stored as little-endian 32-bit) and updating @inode->i_blocks * appropriately. */ static void ext4_free_branches(handle_t *handle, struct inode *inode, struct buffer_head *parent_bh, __le32 *first, __le32 *last, int depth) { ext4_fsblk_t nr; __le32 *p; if (ext4_handle_is_aborted(handle)) return; if (depth--) { struct buffer_head *bh; int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); p = last; while (--p >= first) { nr = le32_to_cpu(*p); if (!nr) continue; /* A hole */ if (!ext4_inode_block_valid(inode, nr, 1)) { EXT4_ERROR_INODE(inode, "invalid indirect mapped " "block %lu (level %d)", (unsigned long) nr, depth); break; } /* Go read the buffer for the next level down */ bh = ext4_sb_bread(inode->i_sb, nr, 0); /* * A read failure? Report error and clear slot * (should be rare). */ if (IS_ERR(bh)) { ext4_error_inode_block(inode, nr, -PTR_ERR(bh), "Read failure"); continue; } /* This zaps the entire block. Bottom up. */ BUFFER_TRACE(bh, "free child branches"); ext4_free_branches(handle, inode, bh, (__le32 *) bh->b_data, (__le32 *) bh->b_data + addr_per_block, depth); brelse(bh); /* * Everything below this pointer has been * released. Now let this top-of-subtree go. * * We want the freeing of this indirect block to be * atomic in the journal with the updating of the * bitmap block which owns it. So make some room in * the journal. * * We zero the parent pointer *after* freeing its * pointee in the bitmaps, so if extend_transaction() * for some reason fails to put the bitmap changes and * the release into the same transaction, recovery * will merely complain about releasing a free block, * rather than leaking blocks. */ if (ext4_handle_is_aborted(handle)) return; if (ext4_ind_truncate_ensure_credits(handle, inode, NULL, ext4_free_metadata_revoke_credits( inode->i_sb, 1)) < 0) return; /* * The forget flag here is critical because if * we are journaling (and not doing data * journaling), we have to make sure a revoke * record is written to prevent the journal * replay from overwriting the (former) * indirect block if it gets reallocated as a * data block. This must happen in the same * transaction where the data blocks are * actually freed. */ ext4_free_blocks(handle, inode, NULL, nr, 1, EXT4_FREE_BLOCKS_METADATA| EXT4_FREE_BLOCKS_FORGET); if (parent_bh) { /* * The block which we have just freed is * pointed to by an indirect block: journal it */ BUFFER_TRACE(parent_bh, "get_write_access"); if (!ext4_journal_get_write_access(handle, inode->i_sb, parent_bh, EXT4_JTR_NONE)) { *p = 0; BUFFER_TRACE(parent_bh, "call ext4_handle_dirty_metadata"); ext4_handle_dirty_metadata(handle, inode, parent_bh); } } } } else { /* We have reached the bottom of the tree. */ BUFFER_TRACE(parent_bh, "free data blocks"); ext4_free_data(handle, inode, parent_bh, first, last); } } void ext4_ind_truncate(handle_t *handle, struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); __le32 *i_data = ei->i_data; int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); ext4_lblk_t offsets[4]; Indirect chain[4]; Indirect *partial; __le32 nr = 0; int n = 0; ext4_lblk_t last_block, max_block; unsigned blocksize = inode->i_sb->s_blocksize; last_block = (inode->i_size + blocksize-1) >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); if (last_block != max_block) { n = ext4_block_to_path(inode, last_block, offsets, NULL); if (n == 0) return; } ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block); /* * The orphan list entry will now protect us from any crash which * occurs before the truncate completes, so it is now safe to propagate * the new, shorter inode size (held for now in i_size) into the * on-disk inode. We do this via i_disksize, which is the value which * ext4 *really* writes onto the disk inode. */ ei->i_disksize = inode->i_size; if (last_block == max_block) { /* * It is unnecessary to free any data blocks if last_block is * equal to the indirect block limit. */ return; } else if (n == 1) { /* direct blocks */ ext4_free_data(handle, inode, NULL, i_data+offsets[0], i_data + EXT4_NDIR_BLOCKS); goto do_indirects; } partial = ext4_find_shared(inode, n, offsets, chain, &nr); /* Kill the top of shared branch (not detached) */ if (nr) { if (partial == chain) { /* Shared branch grows from the inode */ ext4_free_branches(handle, inode, NULL, &nr, &nr+1, (chain+n-1) - partial); *partial->p = 0; /* * We mark the inode dirty prior to restart, * and prior to stop. No need for it here. */ } else { /* Shared branch grows from an indirect block */ BUFFER_TRACE(partial->bh, "get_write_access"); ext4_free_branches(handle, inode, partial->bh, partial->p, partial->p+1, (chain+n-1) - partial); } } /* Clear the ends of indirect blocks on the shared branch */ while (partial > chain) { ext4_free_branches(handle, inode, partial->bh, partial->p + 1, (__le32*)partial->bh->b_data+addr_per_block, (chain+n-1) - partial); BUFFER_TRACE(partial->bh, "call brelse"); brelse(partial->bh); partial--; } do_indirects: /* Kill the remaining (whole) subtrees */ switch (offsets[0]) { default: nr = i_data[EXT4_IND_BLOCK]; if (nr) { ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); i_data[EXT4_IND_BLOCK] = 0; } fallthrough; case EXT4_IND_BLOCK: nr = i_data[EXT4_DIND_BLOCK]; if (nr) { ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); i_data[EXT4_DIND_BLOCK] = 0; } fallthrough; case EXT4_DIND_BLOCK: nr = i_data[EXT4_TIND_BLOCK]; if (nr) { ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); i_data[EXT4_TIND_BLOCK] = 0; } fallthrough; case EXT4_TIND_BLOCK: ; } } /** * ext4_ind_remove_space - remove space from the range * @handle: JBD handle for this transaction * @inode: inode we are dealing with * @start: First block to remove * @end: One block after the last block to remove (exclusive) * * Free the blocks in the defined range (end is exclusive endpoint of * range). This is used by ext4_punch_hole(). */ int ext4_ind_remove_space(handle_t *handle, struct inode *inode, ext4_lblk_t start, ext4_lblk_t end) { struct ext4_inode_info *ei = EXT4_I(inode); __le32 *i_data = ei->i_data; int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); ext4_lblk_t offsets[4], offsets2[4]; Indirect chain[4], chain2[4]; Indirect *partial, *partial2; Indirect *p = NULL, *p2 = NULL; ext4_lblk_t max_block; __le32 nr = 0, nr2 = 0; int n = 0, n2 = 0; unsigned blocksize = inode->i_sb->s_blocksize; max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); if (end >= max_block) end = max_block; if ((start >= end) || (start > max_block)) return 0; n = ext4_block_to_path(inode, start, offsets, NULL); n2 = ext4_block_to_path(inode, end, offsets2, NULL); BUG_ON(n > n2); if ((n == 1) && (n == n2)) { /* We're punching only within direct block range */ ext4_free_data(handle, inode, NULL, i_data + offsets[0], i_data + offsets2[0]); return 0; } else if (n2 > n) { /* * Start and end are on a different levels so we're going to * free partial block at start, and partial block at end of * the range. If there are some levels in between then * do_indirects label will take care of that. */ if (n == 1) { /* * Start is at the direct block level, free * everything to the end of the level. */ ext4_free_data(handle, inode, NULL, i_data + offsets[0], i_data + EXT4_NDIR_BLOCKS); goto end_range; } partial = p = ext4_find_shared(inode, n, offsets, chain, &nr); if (nr) { if (partial == chain) { /* Shared branch grows from the inode */ ext4_free_branches(handle, inode, NULL, &nr, &nr+1, (chain+n-1) - partial); *partial->p = 0; } else { /* Shared branch grows from an indirect block */ BUFFER_TRACE(partial->bh, "get_write_access"); ext4_free_branches(handle, inode, partial->bh, partial->p, partial->p+1, (chain+n-1) - partial); } } /* * Clear the ends of indirect blocks on the shared branch * at the start of the range */ while (partial > chain) { ext4_free_branches(handle, inode, partial->bh, partial->p + 1, (__le32 *)partial->bh->b_data+addr_per_block, (chain+n-1) - partial); partial--; } end_range: partial2 = p2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2); if (nr2) { if (partial2 == chain2) { /* * Remember, end is exclusive so here we're at * the start of the next level we're not going * to free. Everything was covered by the start * of the range. */ goto do_indirects; } } else { /* * ext4_find_shared returns Indirect structure which * points to the last element which should not be * removed by truncate. But this is end of the range * in punch_hole so we need to point to the next element */ partial2->p++; } /* * Clear the ends of indirect blocks on the shared branch * at the end of the range */ while (partial2 > chain2) { ext4_free_branches(handle, inode, partial2->bh, (__le32 *)partial2->bh->b_data, partial2->p, (chain2+n2-1) - partial2); partial2--; } goto do_indirects; } /* Punch happened within the same level (n == n2) */ partial = p = ext4_find_shared(inode, n, offsets, chain, &nr); partial2 = p2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2); /* Free top, but only if partial2 isn't its subtree. */ if (nr) { int level = min(partial - chain, partial2 - chain2); int i; int subtree = 1; for (i = 0; i <= level; i++) { if (offsets[i] != offsets2[i]) { subtree = 0; break; } } if (!subtree) { if (partial == chain) { /* Shared branch grows from the inode */ ext4_free_branches(handle, inode, NULL, &nr, &nr+1, (chain+n-1) - partial); *partial->p = 0; } else { /* Shared branch grows from an indirect block */ BUFFER_TRACE(partial->bh, "get_write_access"); ext4_free_branches(handle, inode, partial->bh, partial->p, partial->p+1, (chain+n-1) - partial); } } } if (!nr2) { /* * ext4_find_shared returns Indirect structure which * points to the last element which should not be * removed by truncate. But this is end of the range * in punch_hole so we need to point to the next element */ partial2->p++; } while (partial > chain || partial2 > chain2) { int depth = (chain+n-1) - partial; int depth2 = (chain2+n2-1) - partial2; if (partial > chain && partial2 > chain2 && partial->bh->b_blocknr == partial2->bh->b_blocknr) { /* * We've converged on the same block. Clear the range, * then we're done. */ ext4_free_branches(handle, inode, partial->bh, partial->p + 1, partial2->p, (chain+n-1) - partial); goto cleanup; } /* * The start and end partial branches may not be at the same * level even though the punch happened within one level. So, we * give them a chance to arrive at the same level, then walk * them in step with each other until we converge on the same * block. */ if (partial > chain && depth <= depth2) { ext4_free_branches(handle, inode, partial->bh, partial->p + 1, (__le32 *)partial->bh->b_data+addr_per_block, (chain+n-1) - partial); partial--; } if (partial2 > chain2 && depth2 <= depth) { ext4_free_branches(handle, inode, partial2->bh, (__le32 *)partial2->bh->b_data, partial2->p, (chain2+n2-1) - partial2); partial2--; } } cleanup: while (p && p > chain) { BUFFER_TRACE(p->bh, "call brelse"); brelse(p->bh); p--; } while (p2 && p2 > chain2) { BUFFER_TRACE(p2->bh, "call brelse"); brelse(p2->bh); p2--; } return 0; do_indirects: /* Kill the remaining (whole) subtrees */ switch (offsets[0]) { default: if (++n >= n2) break; nr = i_data[EXT4_IND_BLOCK]; if (nr) { ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); i_data[EXT4_IND_BLOCK] = 0; } fallthrough; case EXT4_IND_BLOCK: if (++n >= n2) break; nr = i_data[EXT4_DIND_BLOCK]; if (nr) { ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); i_data[EXT4_DIND_BLOCK] = 0; } fallthrough; case EXT4_DIND_BLOCK: if (++n >= n2) break; nr = i_data[EXT4_TIND_BLOCK]; if (nr) { ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); i_data[EXT4_TIND_BLOCK] = 0; } fallthrough; case EXT4_TIND_BLOCK: ; } goto cleanup; } |
1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 | // SPDX-License-Identifier: GPL-2.0 OR MIT /* * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. * * This is based in part on Andrew Moon's poly1305-donna, which is in the * public domain. */ #include <linux/kernel.h> #include <asm/unaligned.h> #include <crypto/internal/poly1305.h> typedef __uint128_t u128; void poly1305_core_setkey(struct poly1305_core_key *key, const u8 raw_key[POLY1305_BLOCK_SIZE]) { u64 t0, t1; /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ t0 = get_unaligned_le64(&raw_key[0]); t1 = get_unaligned_le64(&raw_key[8]); key->key.r64[0] = t0 & 0xffc0fffffffULL; key->key.r64[1] = ((t0 >> 44) | (t1 << 20)) & 0xfffffc0ffffULL; key->key.r64[2] = ((t1 >> 24)) & 0x00ffffffc0fULL; /* s = 20*r */ key->precomputed_s.r64[0] = key->key.r64[1] * 20; key->precomputed_s.r64[1] = key->key.r64[2] * 20; } EXPORT_SYMBOL(poly1305_core_setkey); void poly1305_core_blocks(struct poly1305_state *state, const struct poly1305_core_key *key, const void *src, unsigned int nblocks, u32 hibit) { const u8 *input = src; u64 hibit64; u64 r0, r1, r2; u64 s1, s2; u64 h0, h1, h2; u64 c; u128 d0, d1, d2, d; if (!nblocks) return; hibit64 = ((u64)hibit) << 40; r0 = key->key.r64[0]; r1 = key->key.r64[1]; r2 = key->key.r64[2]; h0 = state->h64[0]; h1 = state->h64[1]; h2 = state->h64[2]; s1 = key->precomputed_s.r64[0]; s2 = key->precomputed_s.r64[1]; do { u64 t0, t1; /* h += m[i] */ t0 = get_unaligned_le64(&input[0]); t1 = get_unaligned_le64(&input[8]); h0 += t0 & 0xfffffffffffULL; h1 += ((t0 >> 44) | (t1 << 20)) & 0xfffffffffffULL; h2 += (((t1 >> 24)) & 0x3ffffffffffULL) | hibit64; /* h *= r */ d0 = (u128)h0 * r0; d = (u128)h1 * s2; d0 += d; d = (u128)h2 * s1; d0 += d; d1 = (u128)h0 * r1; d = (u128)h1 * r0; d1 += d; d = (u128)h2 * s2; d1 += d; d2 = (u128)h0 * r2; d = (u128)h1 * r1; d2 += d; d = (u128)h2 * r0; d2 += d; /* (partial) h %= p */ c = (u64)(d0 >> 44); h0 = (u64)d0 & 0xfffffffffffULL; d1 += c; c = (u64)(d1 >> 44); h1 = (u64)d1 & 0xfffffffffffULL; d2 += c; c = (u64)(d2 >> 42); h2 = (u64)d2 & 0x3ffffffffffULL; h0 += c * 5; c = h0 >> 44; h0 = h0 & 0xfffffffffffULL; h1 += c; input += POLY1305_BLOCK_SIZE; } while (--nblocks); state->h64[0] = h0; state->h64[1] = h1; state->h64[2] = h2; } EXPORT_SYMBOL(poly1305_core_blocks); void poly1305_core_emit(const struct poly1305_state *state, const u32 nonce[4], void *dst) { u8 *mac = dst; u64 h0, h1, h2, c; u64 g0, g1, g2; u64 t0, t1; /* fully carry h */ h0 = state->h64[0]; h1 = state->h64[1]; h2 = state->h64[2]; c = h1 >> 44; h1 &= 0xfffffffffffULL; h2 += c; c = h2 >> 42; h2 &= 0x3ffffffffffULL; h0 += c * 5; c = h0 >> 44; h0 &= 0xfffffffffffULL; h1 += c; c = h1 >> 44; h1 &= 0xfffffffffffULL; h2 += c; c = h2 >> 42; h2 &= 0x3ffffffffffULL; h0 += c * 5; c = h0 >> 44; h0 &= 0xfffffffffffULL; h1 += c; /* compute h + -p */ g0 = h0 + 5; c = g0 >> 44; g0 &= 0xfffffffffffULL; g1 = h1 + c; c = g1 >> 44; g1 &= 0xfffffffffffULL; g2 = h2 + c - (1ULL << 42); /* select h if h < p, or h + -p if h >= p */ c = (g2 >> ((sizeof(u64) * 8) - 1)) - 1; g0 &= c; g1 &= c; g2 &= c; c = ~c; h0 = (h0 & c) | g0; h1 = (h1 & c) | g1; h2 = (h2 & c) | g2; if (likely(nonce)) { /* h = (h + nonce) */ t0 = ((u64)nonce[1] << 32) | nonce[0]; t1 = ((u64)nonce[3] << 32) | nonce[2]; h0 += t0 & 0xfffffffffffULL; c = h0 >> 44; h0 &= 0xfffffffffffULL; h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffffULL) + c; c = h1 >> 44; h1 &= 0xfffffffffffULL; h2 += (((t1 >> 24)) & 0x3ffffffffffULL) + c; h2 &= 0x3ffffffffffULL; } /* mac = h % (2^128) */ h0 = h0 | (h1 << 44); h1 = (h1 >> 20) | (h2 << 24); put_unaligned_le64(h0, &mac[0]); put_unaligned_le64(h1, &mac[8]); } EXPORT_SYMBOL(poly1305_core_emit); |
8299 3545 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef IOPRIO_H #define IOPRIO_H #include <linux/sched.h> #include <linux/sched/rt.h> #include <linux/iocontext.h> #include <uapi/linux/ioprio.h> /* * Default IO priority. */ #define IOPRIO_DEFAULT IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0) /* * Check that a priority value has a valid class. */ static inline bool ioprio_valid(unsigned short ioprio) { unsigned short class = IOPRIO_PRIO_CLASS(ioprio); return class > IOPRIO_CLASS_NONE && class <= IOPRIO_CLASS_IDLE; } /* * if process has set io priority explicitly, use that. if not, convert * the cpu scheduler nice value to an io priority */ static inline int task_nice_ioprio(struct task_struct *task) { return (task_nice(task) + 20) / 5; } /* * This is for the case where the task hasn't asked for a specific IO class. * Check for idle and rt task process, and return appropriate IO class. */ static inline int task_nice_ioclass(struct task_struct *task) { if (task->policy == SCHED_IDLE) return IOPRIO_CLASS_IDLE; else if (task_is_realtime(task)) return IOPRIO_CLASS_RT; else return IOPRIO_CLASS_BE; } #ifdef CONFIG_BLOCK int __get_task_ioprio(struct task_struct *p); #else static inline int __get_task_ioprio(struct task_struct *p) { return IOPRIO_DEFAULT; } #endif /* CONFIG_BLOCK */ static inline int get_current_ioprio(void) { return __get_task_ioprio(current); } extern int set_task_ioprio(struct task_struct *task, int ioprio); #ifdef CONFIG_BLOCK extern int ioprio_check_cap(int ioprio); #else static inline int ioprio_check_cap(int ioprio) { return -ENOTBLK; } #endif /* CONFIG_BLOCK */ #endif |
425 425 419 6 425 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 | // SPDX-License-Identifier: GPL-2.0 /* -*- linux-c -*- * sysctl_net_core.c: sysctl interface to net core subsystem. * * Begun April 1, 1996, Mike Shaver. * Added /proc/sys/net/core directory entry (empty =) ). [MS] */ #include <linux/filter.h> #include <linux/mm.h> #include <linux/sysctl.h> #include <linux/module.h> #include <linux/socket.h> #include <linux/netdevice.h> #include <linux/ratelimit.h> #include <linux/vmalloc.h> #include <linux/init.h> #include <linux/slab.h> #include <net/ip.h> #include <net/sock.h> #include <net/net_ratelimit.h> #include <net/busy_poll.h> #include <net/pkt_sched.h> #include "dev.h" static int int_3600 = 3600; static int min_sndbuf = SOCK_MIN_SNDBUF; static int min_rcvbuf = SOCK_MIN_RCVBUF; static int max_skb_frags = MAX_SKB_FRAGS; static int min_mem_pcpu_rsv = SK_MEMORY_PCPU_RESERVE; static int net_msg_warn; /* Unused, but still a sysctl */ int sysctl_fb_tunnels_only_for_init_net __read_mostly = 0; EXPORT_SYMBOL(sysctl_fb_tunnels_only_for_init_net); /* 0 - Keep current behavior: * IPv4: inherit all current settings from init_net * IPv6: reset all settings to default * 1 - Both inherit all current settings from init_net * 2 - Both reset all settings to default * 3 - Both inherit all settings from current netns */ int sysctl_devconf_inherit_init_net __read_mostly; EXPORT_SYMBOL(sysctl_devconf_inherit_init_net); #ifdef CONFIG_RPS static int rps_sock_flow_sysctl(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { unsigned int orig_size, size; int ret, i; struct ctl_table tmp = { .data = &size, .maxlen = sizeof(size), .mode = table->mode }; struct rps_sock_flow_table *orig_sock_table, *sock_table; static DEFINE_MUTEX(sock_flow_mutex); mutex_lock(&sock_flow_mutex); orig_sock_table = rcu_dereference_protected(rps_sock_flow_table, lockdep_is_held(&sock_flow_mutex)); size = orig_size = orig_sock_table ? orig_sock_table->mask + 1 : 0; ret = proc_dointvec(&tmp, write, buffer, lenp, ppos); if (write) { if (size) { if (size > 1<<29) { /* Enforce limit to prevent overflow */ mutex_unlock(&sock_flow_mutex); return -EINVAL; } size = roundup_pow_of_two(size); if (size != orig_size) { sock_table = vmalloc(RPS_SOCK_FLOW_TABLE_SIZE(size)); if (!sock_table) { mutex_unlock(&sock_flow_mutex); return -ENOMEM; } rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1; sock_table->mask = size - 1; } else sock_table = orig_sock_table; for (i = 0; i < size; i++) sock_table->ents[i] = RPS_NO_CPU; } else sock_table = NULL; if (sock_table != orig_sock_table) { rcu_assign_pointer(rps_sock_flow_table, sock_table); if (sock_table) { static_branch_inc(&rps_needed); static_branch_inc(&rfs_needed); } if (orig_sock_table) { static_branch_dec(&rps_needed); static_branch_dec(&rfs_needed); kvfree_rcu(orig_sock_table); } } } mutex_unlock(&sock_flow_mutex); return ret; } #endif /* CONFIG_RPS */ #ifdef CONFIG_NET_FLOW_LIMIT static DEFINE_MUTEX(flow_limit_update_mutex); static int flow_limit_cpu_sysctl(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct sd_flow_limit *cur; struct softnet_data *sd; cpumask_var_t mask; int i, len, ret = 0; if (!alloc_cpumask_var(&mask, GFP_KERNEL)) return -ENOMEM; if (write) { ret = cpumask_parse(buffer, mask); if (ret) goto done; mutex_lock(&flow_limit_update_mutex); len = sizeof(*cur) + netdev_flow_limit_table_len; for_each_possible_cpu(i) { sd = &per_cpu(softnet_data, i); cur = rcu_dereference_protected(sd->flow_limit, lockdep_is_held(&flow_limit_update_mutex)); if (cur && !cpumask_test_cpu(i, mask)) { RCU_INIT_POINTER(sd->flow_limit, NULL); kfree_rcu(cur); } else if (!cur && cpumask_test_cpu(i, mask)) { cur = kzalloc_node(len, GFP_KERNEL, cpu_to_node(i)); if (!cur) { /* not unwinding previous changes */ ret = -ENOMEM; goto write_unlock; } cur->num_buckets = netdev_flow_limit_table_len; rcu_assign_pointer(sd->flow_limit, cur); } } write_unlock: mutex_unlock(&flow_limit_update_mutex); } else { char kbuf[128]; if (*ppos || !*lenp) { *lenp = 0; goto done; } cpumask_clear(mask); rcu_read_lock(); for_each_possible_cpu(i) { sd = &per_cpu(softnet_data, i); if (rcu_dereference(sd->flow_limit)) cpumask_set_cpu(i, mask); } rcu_read_unlock(); len = min(sizeof(kbuf) - 1, *lenp); len = scnprintf(kbuf, len, "%*pb", cpumask_pr_args(mask)); if (!len) { *lenp = 0; goto done; } if (len < *lenp) kbuf[len++] = '\n'; memcpy(buffer, kbuf, len); *lenp = len; *ppos += len; } done: free_cpumask_var(mask); return ret; } static int flow_limit_table_len_sysctl(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { unsigned int old, *ptr; int ret; mutex_lock(&flow_limit_update_mutex); ptr = table->data; old = *ptr; ret = proc_dointvec(table, write, buffer, lenp, ppos); if (!ret && write && !is_power_of_2(*ptr)) { *ptr = old; ret = -EINVAL; } mutex_unlock(&flow_limit_update_mutex); return ret; } #endif /* CONFIG_NET_FLOW_LIMIT */ #ifdef CONFIG_NET_SCHED static int set_default_qdisc(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { char id[IFNAMSIZ]; struct ctl_table tbl = { .data = id, .maxlen = IFNAMSIZ, }; int ret; qdisc_get_default(id, IFNAMSIZ); ret = proc_dostring(&tbl, write, buffer, lenp, ppos); if (write && ret == 0) ret = qdisc_set_default(id); return ret; } #endif static int proc_do_dev_weight(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { static DEFINE_MUTEX(dev_weight_mutex); int ret, weight; mutex_lock(&dev_weight_mutex); ret = proc_dointvec(table, write, buffer, lenp, ppos); if (!ret && write) { weight = READ_ONCE(weight_p); WRITE_ONCE(dev_rx_weight, weight * dev_weight_rx_bias); WRITE_ONCE(dev_tx_weight, weight * dev_weight_tx_bias); } mutex_unlock(&dev_weight_mutex); return ret; } static int proc_do_rss_key(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table fake_table; char buf[NETDEV_RSS_KEY_LEN * 3]; snprintf(buf, sizeof(buf), "%*phC", NETDEV_RSS_KEY_LEN, netdev_rss_key); fake_table.data = buf; fake_table.maxlen = sizeof(buf); return proc_dostring(&fake_table, write, buffer, lenp, ppos); } #ifdef CONFIG_BPF_JIT static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret, jit_enable = *(int *)table->data; int min = *(int *)table->extra1; int max = *(int *)table->extra2; struct ctl_table tmp = *table; if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; tmp.data = &jit_enable; ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); if (write && !ret) { if (jit_enable < 2 || (jit_enable == 2 && bpf_dump_raw_ok(current_cred()))) { *(int *)table->data = jit_enable; if (jit_enable == 2) pr_warn("bpf_jit_enable = 2 was set! NEVER use this in production, only for JIT debugging!\n"); } else { ret = -EPERM; } } if (write && ret && min == max) pr_info_once("CONFIG_BPF_JIT_ALWAYS_ON is enabled, bpf_jit_enable is permanently set to 1.\n"); return ret; } # ifdef CONFIG_HAVE_EBPF_JIT static int proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; return proc_dointvec_minmax(table, write, buffer, lenp, ppos); } # endif /* CONFIG_HAVE_EBPF_JIT */ static int proc_dolongvec_minmax_bpf_restricted(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } #endif static struct ctl_table net_core_table[] = { { .procname = "wmem_max", .data = &sysctl_wmem_max, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &min_sndbuf, }, { .procname = "rmem_max", .data = &sysctl_rmem_max, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &min_rcvbuf, }, { .procname = "wmem_default", .data = &sysctl_wmem_default, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &min_sndbuf, }, { .procname = "rmem_default", .data = &sysctl_rmem_default, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &min_rcvbuf, }, { .procname = "mem_pcpu_rsv", .data = &sysctl_mem_pcpu_rsv, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &min_mem_pcpu_rsv, }, { .procname = "dev_weight", .data = &weight_p, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_do_dev_weight, }, { .procname = "dev_weight_rx_bias", .data = &dev_weight_rx_bias, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_do_dev_weight, }, { .procname = "dev_weight_tx_bias", .data = &dev_weight_tx_bias, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_do_dev_weight, }, { .procname = "netdev_max_backlog", .data = &netdev_max_backlog, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "netdev_rss_key", .data = &netdev_rss_key, .maxlen = sizeof(int), .mode = 0444, .proc_handler = proc_do_rss_key, }, #ifdef CONFIG_BPF_JIT { .procname = "bpf_jit_enable", .data = &bpf_jit_enable, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax_bpf_enable, # ifdef CONFIG_BPF_JIT_ALWAYS_ON .extra1 = SYSCTL_ONE, .extra2 = SYSCTL_ONE, # else .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, # endif }, # ifdef CONFIG_HAVE_EBPF_JIT { .procname = "bpf_jit_harden", .data = &bpf_jit_harden, .maxlen = sizeof(int), .mode = 0600, .proc_handler = proc_dointvec_minmax_bpf_restricted, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, { .procname = "bpf_jit_kallsyms", .data = &bpf_jit_kallsyms, .maxlen = sizeof(int), .mode = 0600, .proc_handler = proc_dointvec_minmax_bpf_restricted, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, # endif { .procname = "bpf_jit_limit", .data = &bpf_jit_limit, .maxlen = sizeof(long), .mode = 0600, .proc_handler = proc_dolongvec_minmax_bpf_restricted, .extra1 = SYSCTL_LONG_ONE, .extra2 = &bpf_jit_limit_max, }, #endif { .procname = "netdev_tstamp_prequeue", .data = &netdev_tstamp_prequeue, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "message_cost", .data = &net_ratelimit_state.interval, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "message_burst", .data = &net_ratelimit_state.burst, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "optmem_max", .data = &sysctl_optmem_max, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "tstamp_allow_data", .data = &sysctl_tstamp_allow_data, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE }, #ifdef CONFIG_RPS { .procname = "rps_sock_flow_entries", .maxlen = sizeof(int), .mode = 0644, .proc_handler = rps_sock_flow_sysctl }, #endif #ifdef CONFIG_NET_FLOW_LIMIT { .procname = "flow_limit_cpu_bitmap", .mode = 0644, .proc_handler = flow_limit_cpu_sysctl }, { .procname = "flow_limit_table_len", .data = &netdev_flow_limit_table_len, .maxlen = sizeof(int), .mode = 0644, .proc_handler = flow_limit_table_len_sysctl }, #endif /* CONFIG_NET_FLOW_LIMIT */ #ifdef CONFIG_NET_RX_BUSY_POLL { .procname = "busy_poll", .data = &sysctl_net_busy_poll, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, { .procname = "busy_read", .data = &sysctl_net_busy_read, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, #endif #ifdef CONFIG_NET_SCHED { .procname = "default_qdisc", .mode = 0644, .maxlen = IFNAMSIZ, .proc_handler = set_default_qdisc }, #endif { .procname = "netdev_budget", .data = &netdev_budget, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "warnings", .data = &net_msg_warn, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "max_skb_frags", .data = &sysctl_max_skb_frags, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE, .extra2 = &max_skb_frags, }, { .procname = "netdev_budget_usecs", .data = &netdev_budget_usecs, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, { .procname = "fb_tunnels_only_for_init_net", .data = &sysctl_fb_tunnels_only_for_init_net, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, { .procname = "devconf_inherit_init_net", .data = &sysctl_devconf_inherit_init_net, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_THREE, }, { .procname = "high_order_alloc_disable", .data = &net_high_order_alloc_disable_key.key, .maxlen = sizeof(net_high_order_alloc_disable_key), .mode = 0644, .proc_handler = proc_do_static_key, }, { .procname = "gro_normal_batch", .data = &gro_normal_batch, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE, }, { .procname = "netdev_unregister_timeout_secs", .data = &netdev_unregister_timeout_secs, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE, .extra2 = &int_3600, }, { .procname = "skb_defer_max", .data = &sysctl_skb_defer_max, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, { } }; static struct ctl_table netns_core_table[] = { { .procname = "somaxconn", .data = &init_net.core.sysctl_somaxconn, .maxlen = sizeof(int), .mode = 0644, .extra1 = SYSCTL_ZERO, .proc_handler = proc_dointvec_minmax }, { .procname = "txrehash", .data = &init_net.core.sysctl_txrehash, .maxlen = sizeof(u8), .mode = 0644, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, .proc_handler = proc_dou8vec_minmax, }, { } }; static int __init fb_tunnels_only_for_init_net_sysctl_setup(char *str) { /* fallback tunnels for initns only */ if (!strncmp(str, "initns", 6)) sysctl_fb_tunnels_only_for_init_net = 1; /* no fallback tunnels anywhere */ else if (!strncmp(str, "none", 4)) sysctl_fb_tunnels_only_for_init_net = 2; return 1; } __setup("fb_tunnels=", fb_tunnels_only_for_init_net_sysctl_setup); static __net_init int sysctl_core_net_init(struct net *net) { struct ctl_table *tbl, *tmp; tbl = netns_core_table; if (!net_eq(net, &init_net)) { tbl = kmemdup(tbl, sizeof(netns_core_table), GFP_KERNEL); if (tbl == NULL) goto err_dup; for (tmp = tbl; tmp->procname; tmp++) tmp->data += (char *)net - (char *)&init_net; /* Don't export any sysctls to unprivileged users */ if (net->user_ns != &init_user_ns) { tbl[0].procname = NULL; } } net->core.sysctl_hdr = register_net_sysctl(net, "net/core", tbl); if (net->core.sysctl_hdr == NULL) goto err_reg; return 0; err_reg: if (tbl != netns_core_table) kfree(tbl); err_dup: return -ENOMEM; } static __net_exit void sysctl_core_net_exit(struct net *net) { struct ctl_table *tbl; tbl = net->core.sysctl_hdr->ctl_table_arg; unregister_net_sysctl_table(net->core.sysctl_hdr); BUG_ON(tbl == netns_core_table); kfree(tbl); } static __net_initdata struct pernet_operations sysctl_core_ops = { .init = sysctl_core_net_init, .exit = sysctl_core_net_exit, }; static __init int sysctl_core_init(void) { register_net_sysctl(&init_net, "net/core", net_core_table); return register_pernet_subsys(&sysctl_core_ops); } fs_initcall(sysctl_core_init); |
78 24 17 67 3 65 55 15 26 38 14 42 1 41 19 23 46 1 8 17 19 14 19 20 17 18 23 23 23 47 47 47 1 47 12 43 43 23 23 23 43 43 44 7 9 29 2 18 37 33 16 33 1 32 1 32 2 14 4 12 8 3 1 16 136 207 26 28 43 178 26 16 11 87 78 142 142 142 142 141 18 2 16 18 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2012-2013 Samsung Electronics Co., Ltd. */ #include <linux/init.h> #include <linux/buffer_head.h> #include <linux/mpage.h> #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/time.h> #include <linux/writeback.h> #include <linux/uio.h> #include <linux/random.h> #include <linux/iversion.h> #include "exfat_raw.h" #include "exfat_fs.h" int __exfat_write_inode(struct inode *inode, int sync) { unsigned long long on_disk_size; struct exfat_dentry *ep, *ep2; struct exfat_entry_set_cache *es = NULL; struct super_block *sb = inode->i_sb; struct exfat_sb_info *sbi = EXFAT_SB(sb); struct exfat_inode_info *ei = EXFAT_I(inode); bool is_dir = (ei->type == TYPE_DIR) ? true : false; if (inode->i_ino == EXFAT_ROOT_INO) return 0; /* * If the inode is already unlinked, there is no need for updating it. */ if (ei->dir.dir == DIR_DELETED) return 0; if (is_dir && ei->dir.dir == sbi->root_dir && ei->entry == -1) return 0; exfat_set_volume_dirty(sb); /* get the directory entry of given file or directory */ es = exfat_get_dentry_set(sb, &(ei->dir), ei->entry, ES_ALL_ENTRIES); if (!es) return -EIO; ep = exfat_get_dentry_cached(es, 0); ep2 = exfat_get_dentry_cached(es, 1); ep->dentry.file.attr = cpu_to_le16(exfat_make_attr(inode)); /* set FILE_INFO structure using the acquired struct exfat_dentry */ exfat_set_entry_time(sbi, &ei->i_crtime, &ep->dentry.file.create_tz, &ep->dentry.file.create_time, &ep->dentry.file.create_date, &ep->dentry.file.create_time_cs); exfat_set_entry_time(sbi, &inode->i_mtime, &ep->dentry.file.modify_tz, &ep->dentry.file.modify_time, &ep->dentry.file.modify_date, &ep->dentry.file.modify_time_cs); exfat_set_entry_time(sbi, &inode->i_atime, &ep->dentry.file.access_tz, &ep->dentry.file.access_time, &ep->dentry.file.access_date, NULL); /* File size should be zero if there is no cluster allocated */ on_disk_size = i_size_read(inode); if (ei->start_clu == EXFAT_EOF_CLUSTER) on_disk_size = 0; ep2->dentry.stream.valid_size = cpu_to_le64(on_disk_size); ep2->dentry.stream.size = ep2->dentry.stream.valid_size; if (on_disk_size) { ep2->dentry.stream.flags = ei->flags; ep2->dentry.stream.start_clu = cpu_to_le32(ei->start_clu); } else { ep2->dentry.stream.flags = ALLOC_FAT_CHAIN; ep2->dentry.stream.start_clu = EXFAT_FREE_CLUSTER; } exfat_update_dir_chksum_with_entry_set(es); return exfat_free_dentry_set(es, sync); } int exfat_write_inode(struct inode *inode, struct writeback_control *wbc) { int ret; mutex_lock(&EXFAT_SB(inode->i_sb)->s_lock); ret = __exfat_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); mutex_unlock(&EXFAT_SB(inode->i_sb)->s_lock); return ret; } void exfat_sync_inode(struct inode *inode) { lockdep_assert_held(&EXFAT_SB(inode->i_sb)->s_lock); __exfat_write_inode(inode, 1); } /* * Input: inode, (logical) clu_offset, target allocation area * Output: errcode, cluster number * *clu = (~0), if it's unable to allocate a new cluster */ static int exfat_map_cluster(struct inode *inode, unsigned int clu_offset, unsigned int *clu, int create) { int ret; unsigned int last_clu; struct exfat_chain new_clu; struct super_block *sb = inode->i_sb; struct exfat_sb_info *sbi = EXFAT_SB(sb); struct exfat_inode_info *ei = EXFAT_I(inode); unsigned int local_clu_offset = clu_offset; unsigned int num_to_be_allocated = 0, num_clusters = 0; if (ei->i_size_ondisk > 0) num_clusters = EXFAT_B_TO_CLU_ROUND_UP(ei->i_size_ondisk, sbi); if (clu_offset >= num_clusters) num_to_be_allocated = clu_offset - num_clusters + 1; if (!create && (num_to_be_allocated > 0)) { *clu = EXFAT_EOF_CLUSTER; return 0; } *clu = last_clu = ei->start_clu; if (ei->flags == ALLOC_NO_FAT_CHAIN) { if (clu_offset > 0 && *clu != EXFAT_EOF_CLUSTER) { last_clu += clu_offset - 1; if (clu_offset == num_clusters) *clu = EXFAT_EOF_CLUSTER; else *clu += clu_offset; } } else if (ei->type == TYPE_FILE) { unsigned int fclus = 0; int err = exfat_get_cluster(inode, clu_offset, &fclus, clu, &last_clu, 1); if (err) return -EIO; clu_offset -= fclus; } else { /* hint information */ if (clu_offset > 0 && ei->hint_bmap.off != EXFAT_EOF_CLUSTER && ei->hint_bmap.off > 0 && clu_offset >= ei->hint_bmap.off) { clu_offset -= ei->hint_bmap.off; /* hint_bmap.clu should be valid */ WARN_ON(ei->hint_bmap.clu < 2); *clu = ei->hint_bmap.clu; } while (clu_offset > 0 && *clu != EXFAT_EOF_CLUSTER) { last_clu = *clu; if (exfat_get_next_cluster(sb, clu)) return -EIO; clu_offset--; } } if (*clu == EXFAT_EOF_CLUSTER) { exfat_set_volume_dirty(sb); new_clu.dir = (last_clu == EXFAT_EOF_CLUSTER) ? EXFAT_EOF_CLUSTER : last_clu + 1; new_clu.size = 0; new_clu.flags = ei->flags; /* allocate a cluster */ if (num_to_be_allocated < 1) { /* Broken FAT (i_sze > allocated FAT) */ exfat_fs_error(sb, "broken FAT chain."); return -EIO; } ret = exfat_alloc_cluster(inode, num_to_be_allocated, &new_clu, inode_needs_sync(inode)); if (ret) return ret; if (new_clu.dir == EXFAT_EOF_CLUSTER || new_clu.dir == EXFAT_FREE_CLUSTER) { exfat_fs_error(sb, "bogus cluster new allocated (last_clu : %u, new_clu : %u)", last_clu, new_clu.dir); return -EIO; } /* append to the FAT chain */ if (last_clu == EXFAT_EOF_CLUSTER) { if (new_clu.flags == ALLOC_FAT_CHAIN) ei->flags = ALLOC_FAT_CHAIN; ei->start_clu = new_clu.dir; } else { if (new_clu.flags != ei->flags) { /* no-fat-chain bit is disabled, * so fat-chain should be synced with * alloc-bitmap */ exfat_chain_cont_cluster(sb, ei->start_clu, num_clusters); ei->flags = ALLOC_FAT_CHAIN; } if (new_clu.flags == ALLOC_FAT_CHAIN) if (exfat_ent_set(sb, last_clu, new_clu.dir)) return -EIO; } num_clusters += num_to_be_allocated; *clu = new_clu.dir; inode->i_blocks += EXFAT_CLU_TO_B(num_to_be_allocated, sbi) >> 9; /* * Move *clu pointer along FAT chains (hole care) because the * caller of this function expect *clu to be the last cluster. * This only works when num_to_be_allocated >= 2, * *clu = (the first cluster of the allocated chain) => * (the last cluster of ...) */ if (ei->flags == ALLOC_NO_FAT_CHAIN) { *clu += num_to_be_allocated - 1; } else { while (num_to_be_allocated > 1) { if (exfat_get_next_cluster(sb, clu)) return -EIO; num_to_be_allocated--; } } } /* hint information */ ei->hint_bmap.off = local_clu_offset; ei->hint_bmap.clu = *clu; return 0; } static int exfat_map_new_buffer(struct exfat_inode_info *ei, struct buffer_head *bh, loff_t pos) { if (buffer_delay(bh) && pos > ei->i_size_aligned) return -EIO; set_buffer_new(bh); /* * Adjust i_size_aligned if i_size_ondisk is bigger than it. */ if (ei->i_size_ondisk > ei->i_size_aligned) ei->i_size_aligned = ei->i_size_ondisk; return 0; } static int exfat_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { struct exfat_inode_info *ei = EXFAT_I(inode); struct super_block *sb = inode->i_sb; struct exfat_sb_info *sbi = EXFAT_SB(sb); unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; int err = 0; unsigned long mapped_blocks = 0; unsigned int cluster, sec_offset; sector_t last_block; sector_t phys = 0; loff_t pos; mutex_lock(&sbi->s_lock); last_block = EXFAT_B_TO_BLK_ROUND_UP(i_size_read(inode), sb); if (iblock >= last_block && !create) goto done; /* Is this block already allocated? */ err = exfat_map_cluster(inode, iblock >> sbi->sect_per_clus_bits, &cluster, create); if (err) { if (err != -ENOSPC) exfat_fs_error_ratelimit(sb, "failed to bmap (inode : %p iblock : %llu, err : %d)", inode, (unsigned long long)iblock, err); goto unlock_ret; } if (cluster == EXFAT_EOF_CLUSTER) goto done; /* sector offset in cluster */ sec_offset = iblock & (sbi->sect_per_clus - 1); phys = exfat_cluster_to_sector(sbi, cluster) + sec_offset; mapped_blocks = sbi->sect_per_clus - sec_offset; max_blocks = min(mapped_blocks, max_blocks); /* Treat newly added block / cluster */ if (iblock < last_block) create = 0; if (create || buffer_delay(bh_result)) { pos = EXFAT_BLK_TO_B((iblock + 1), sb); if (ei->i_size_ondisk < pos) ei->i_size_ondisk = pos; } if (create) { err = exfat_map_new_buffer(ei, bh_result, pos); if (err) { exfat_fs_error(sb, "requested for bmap out of range(pos : (%llu) > i_size_aligned(%llu)\n", pos, ei->i_size_aligned); goto unlock_ret; } } if (buffer_delay(bh_result)) clear_buffer_delay(bh_result); map_bh(bh_result, sb, phys); done: bh_result->b_size = EXFAT_BLK_TO_B(max_blocks, sb); unlock_ret: mutex_unlock(&sbi->s_lock); return err; } static int exfat_read_folio(struct file *file, struct folio *folio) { return mpage_read_folio(folio, exfat_get_block); } static void exfat_readahead(struct readahead_control *rac) { mpage_readahead(rac, exfat_get_block); } static int exfat_writepage(struct page *page, struct writeback_control *wbc) { return block_write_full_page(page, exfat_get_block, wbc); } static int exfat_writepages(struct address_space *mapping, struct writeback_control *wbc) { return mpage_writepages(mapping, wbc, exfat_get_block); } static void exfat_write_failed(struct address_space *mapping, loff_t to) { struct inode *inode = mapping->host; if (to > i_size_read(inode)) { truncate_pagecache(inode, i_size_read(inode)); inode->i_mtime = inode->i_ctime = current_time(inode); exfat_truncate(inode, EXFAT_I(inode)->i_size_aligned); } } static int exfat_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned int len, struct page **pagep, void **fsdata) { int ret; *pagep = NULL; ret = cont_write_begin(file, mapping, pos, len, pagep, fsdata, exfat_get_block, &EXFAT_I(mapping->host)->i_size_ondisk); if (ret < 0) exfat_write_failed(mapping, pos+len); return ret; } static int exfat_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned int len, unsigned int copied, struct page *pagep, void *fsdata) { struct inode *inode = mapping->host; struct exfat_inode_info *ei = EXFAT_I(inode); int err; err = generic_write_end(file, mapping, pos, len, copied, pagep, fsdata); if (ei->i_size_aligned < i_size_read(inode)) { exfat_fs_error(inode->i_sb, "invalid size(size(%llu) > aligned(%llu)\n", i_size_read(inode), ei->i_size_aligned); return -EIO; } if (err < len) exfat_write_failed(mapping, pos+len); if (!(err < 0) && !(ei->attr & ATTR_ARCHIVE)) { inode->i_mtime = inode->i_ctime = current_time(inode); ei->attr |= ATTR_ARCHIVE; mark_inode_dirty(inode); } return err; } static ssize_t exfat_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct address_space *mapping = iocb->ki_filp->f_mapping; struct inode *inode = mapping->host; loff_t size = iocb->ki_pos + iov_iter_count(iter); int rw = iov_iter_rw(iter); ssize_t ret; if (rw == WRITE) { /* * FIXME: blockdev_direct_IO() doesn't use ->write_begin(), * so we need to update the ->i_size_aligned to block boundary. * * But we must fill the remaining area or hole by nul for * updating ->i_size_aligned * * Return 0, and fallback to normal buffered write. */ if (EXFAT_I(inode)->i_size_aligned < size) return 0; } /* * Need to use the DIO_LOCKING for avoiding the race * condition of exfat_get_block() and ->truncate(). */ ret = blockdev_direct_IO(iocb, inode, iter, exfat_get_block); if (ret < 0 && (rw & WRITE)) exfat_write_failed(mapping, size); return ret; } static sector_t exfat_aop_bmap(struct address_space *mapping, sector_t block) { sector_t blocknr; /* exfat_get_cluster() assumes the requested blocknr isn't truncated. */ down_read(&EXFAT_I(mapping->host)->truncate_lock); blocknr = generic_block_bmap(mapping, block, exfat_get_block); up_read(&EXFAT_I(mapping->host)->truncate_lock); return blocknr; } /* * exfat_block_truncate_page() zeroes out a mapping from file offset `from' * up to the end of the block which corresponds to `from'. * This is required during truncate to physically zeroout the tail end * of that block so it doesn't yield old data if the file is later grown. * Also, avoid causing failure from fsx for cases of "data past EOF" */ int exfat_block_truncate_page(struct inode *inode, loff_t from) { return block_truncate_page(inode->i_mapping, from, exfat_get_block); } static const struct address_space_operations exfat_aops = { .dirty_folio = block_dirty_folio, .invalidate_folio = block_invalidate_folio, .read_folio = exfat_read_folio, .readahead = exfat_readahead, .writepage = exfat_writepage, .writepages = exfat_writepages, .write_begin = exfat_write_begin, .write_end = exfat_write_end, .direct_IO = exfat_direct_IO, .bmap = exfat_aop_bmap }; static inline unsigned long exfat_hash(loff_t i_pos) { return hash_32(i_pos, EXFAT_HASH_BITS); } void exfat_hash_inode(struct inode *inode, loff_t i_pos) { struct exfat_sb_info *sbi = EXFAT_SB(inode->i_sb); struct hlist_head *head = sbi->inode_hashtable + exfat_hash(i_pos); spin_lock(&sbi->inode_hash_lock); EXFAT_I(inode)->i_pos = i_pos; hlist_add_head(&EXFAT_I(inode)->i_hash_fat, head); spin_unlock(&sbi->inode_hash_lock); } void exfat_unhash_inode(struct inode *inode) { struct exfat_sb_info *sbi = EXFAT_SB(inode->i_sb); spin_lock(&sbi->inode_hash_lock); hlist_del_init(&EXFAT_I(inode)->i_hash_fat); EXFAT_I(inode)->i_pos = 0; spin_unlock(&sbi->inode_hash_lock); } struct inode *exfat_iget(struct super_block *sb, loff_t i_pos) { struct exfat_sb_info *sbi = EXFAT_SB(sb); struct exfat_inode_info *info; struct hlist_head *head = sbi->inode_hashtable + exfat_hash(i_pos); struct inode *inode = NULL; spin_lock(&sbi->inode_hash_lock); hlist_for_each_entry(info, head, i_hash_fat) { WARN_ON(info->vfs_inode.i_sb != sb); if (i_pos != info->i_pos) continue; inode = igrab(&info->vfs_inode); if (inode) break; } spin_unlock(&sbi->inode_hash_lock); return inode; } /* doesn't deal with root inode */ static int exfat_fill_inode(struct inode *inode, struct exfat_dir_entry *info) { struct exfat_sb_info *sbi = EXFAT_SB(inode->i_sb); struct exfat_inode_info *ei = EXFAT_I(inode); loff_t size = info->size; ei->dir = info->dir; ei->entry = info->entry; ei->attr = info->attr; ei->start_clu = info->start_clu; ei->flags = info->flags; ei->type = info->type; ei->version = 0; ei->hint_stat.eidx = 0; ei->hint_stat.clu = info->start_clu; ei->hint_femp.eidx = EXFAT_HINT_NONE; ei->hint_bmap.off = EXFAT_EOF_CLUSTER; ei->i_pos = 0; inode->i_uid = sbi->options.fs_uid; inode->i_gid = sbi->options.fs_gid; inode_inc_iversion(inode); inode->i_generation = get_random_u32(); if (info->attr & ATTR_SUBDIR) { /* directory */ inode->i_generation &= ~1; inode->i_mode = exfat_make_mode(sbi, info->attr, 0777); inode->i_op = &exfat_dir_inode_operations; inode->i_fop = &exfat_dir_operations; set_nlink(inode, info->num_subdirs); } else { /* regular file */ inode->i_generation |= 1; inode->i_mode = exfat_make_mode(sbi, info->attr, 0777); inode->i_op = &exfat_file_inode_operations; inode->i_fop = &exfat_file_operations; inode->i_mapping->a_ops = &exfat_aops; inode->i_mapping->nrpages = 0; } i_size_write(inode, size); /* ondisk and aligned size should be aligned with block size */ if (size & (inode->i_sb->s_blocksize - 1)) { size |= (inode->i_sb->s_blocksize - 1); size++; } ei->i_size_aligned = size; ei->i_size_ondisk = size; exfat_save_attr(inode, info->attr); inode->i_blocks = round_up(i_size_read(inode), sbi->cluster_size) >> 9; inode->i_mtime = info->mtime; inode->i_ctime = info->mtime; ei->i_crtime = info->crtime; inode->i_atime = info->atime; return 0; } struct inode *exfat_build_inode(struct super_block *sb, struct exfat_dir_entry *info, loff_t i_pos) { struct inode *inode; int err; inode = exfat_iget(sb, i_pos); if (inode) goto out; inode = new_inode(sb); if (!inode) { inode = ERR_PTR(-ENOMEM); goto out; } inode->i_ino = iunique(sb, EXFAT_ROOT_INO); inode_set_iversion(inode, 1); err = exfat_fill_inode(inode, info); if (err) { iput(inode); inode = ERR_PTR(err); goto out; } exfat_hash_inode(inode, i_pos); insert_inode_hash(inode); out: return inode; } void exfat_evict_inode(struct inode *inode) { truncate_inode_pages(&inode->i_data, 0); if (!inode->i_nlink) { i_size_write(inode, 0); mutex_lock(&EXFAT_SB(inode->i_sb)->s_lock); __exfat_truncate(inode, 0); mutex_unlock(&EXFAT_SB(inode->i_sb)->s_lock); } invalidate_inode_buffers(inode); clear_inode(inode); exfat_cache_inval_inode(inode); exfat_unhash_inode(inode); } |
1 8 8 1 8 8 6 6 6 6 6 2 2 11 1 10 7 1 6 1 6 3 3 10 11 1 10 10 16 17 33 33 32 2 31 1 1 33 1 1 43 35 1 1 1 1 2 1 1 1 4 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 | // SPDX-License-Identifier: GPL-2.0-or-later /* * * Bluetooth HCI UART driver * * Copyright (C) 2000-2001 Qualcomm Incorporated * Copyright (C) 2002-2003 Maxim Krasnyansky <maxk@qualcomm.com> * Copyright (C) 2004-2005 Marcel Holtmann <marcel@holtmann.org> */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/types.h> #include <linux/fcntl.h> #include <linux/interrupt.h> #include <linux/ptrace.h> #include <linux/poll.h> #include <linux/slab.h> #include <linux/tty.h> #include <linux/errno.h> #include <linux/string.h> #include <linux/signal.h> #include <linux/ioctl.h> #include <linux/skbuff.h> #include <linux/firmware.h> #include <linux/serdev.h> #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> #include "btintel.h" #include "btbcm.h" #include "hci_uart.h" #define VERSION "2.3" static const struct hci_uart_proto *hup[HCI_UART_MAX_PROTO]; int hci_uart_register_proto(const struct hci_uart_proto *p) { if (p->id >= HCI_UART_MAX_PROTO) return -EINVAL; if (hup[p->id]) return -EEXIST; hup[p->id] = p; BT_INFO("HCI UART protocol %s registered", p->name); return 0; } int hci_uart_unregister_proto(const struct hci_uart_proto *p) { if (p->id >= HCI_UART_MAX_PROTO) return -EINVAL; if (!hup[p->id]) return -EINVAL; hup[p->id] = NULL; return 0; } static const struct hci_uart_proto *hci_uart_get_proto(unsigned int id) { if (id >= HCI_UART_MAX_PROTO) return NULL; return hup[id]; } static inline void hci_uart_tx_complete(struct hci_uart *hu, int pkt_type) { struct hci_dev *hdev = hu->hdev; /* Update HCI stat counters */ switch (pkt_type) { case HCI_COMMAND_PKT: hdev->stat.cmd_tx++; break; case HCI_ACLDATA_PKT: hdev->stat.acl_tx++; break; case HCI_SCODATA_PKT: hdev->stat.sco_tx++; break; } } static inline struct sk_buff *hci_uart_dequeue(struct hci_uart *hu) { struct sk_buff *skb = hu->tx_skb; if (!skb) { percpu_down_read(&hu->proto_lock); if (test_bit(HCI_UART_PROTO_READY, &hu->flags)) skb = hu->proto->dequeue(hu); percpu_up_read(&hu->proto_lock); } else { hu->tx_skb = NULL; } return skb; } int hci_uart_tx_wakeup(struct hci_uart *hu) { /* This may be called in an IRQ context, so we can't sleep. Therefore * we try to acquire the lock only, and if that fails we assume the * tty is being closed because that is the only time the write lock is * acquired. If, however, at some point in the future the write lock * is also acquired in other situations, then this must be revisited. */ if (!percpu_down_read_trylock(&hu->proto_lock)) return 0; if (!test_bit(HCI_UART_PROTO_READY, &hu->flags)) goto no_schedule; set_bit(HCI_UART_TX_WAKEUP, &hu->tx_state); if (test_and_set_bit(HCI_UART_SENDING, &hu->tx_state)) goto no_schedule; BT_DBG(""); schedule_work(&hu->write_work); no_schedule: percpu_up_read(&hu->proto_lock); return 0; } EXPORT_SYMBOL_GPL(hci_uart_tx_wakeup); static void hci_uart_write_work(struct work_struct *work) { struct hci_uart *hu = container_of(work, struct hci_uart, write_work); struct tty_struct *tty = hu->tty; struct hci_dev *hdev = hu->hdev; struct sk_buff *skb; /* REVISIT: should we cope with bad skbs or ->write() returning * and error value ? */ restart: clear_bit(HCI_UART_TX_WAKEUP, &hu->tx_state); while ((skb = hci_uart_dequeue(hu))) { int len; set_bit(TTY_DO_WRITE_WAKEUP, &tty->flags); len = tty->ops->write(tty, skb->data, skb->len); hdev->stat.byte_tx += len; skb_pull(skb, len); if (skb->len) { hu->tx_skb = skb; break; } hci_uart_tx_complete(hu, hci_skb_pkt_type(skb)); kfree_skb(skb); } clear_bit(HCI_UART_SENDING, &hu->tx_state); if (test_bit(HCI_UART_TX_WAKEUP, &hu->tx_state)) goto restart; wake_up_bit(&hu->tx_state, HCI_UART_SENDING); } void hci_uart_init_work(struct work_struct *work) { struct hci_uart *hu = container_of(work, struct hci_uart, init_ready); int err; struct hci_dev *hdev; if (!test_and_clear_bit(HCI_UART_INIT_PENDING, &hu->hdev_flags)) return; err = hci_register_dev(hu->hdev); if (err < 0) { BT_ERR("Can't register HCI device"); clear_bit(HCI_UART_PROTO_READY, &hu->flags); hu->proto->close(hu); hdev = hu->hdev; hu->hdev = NULL; hci_free_dev(hdev); return; } set_bit(HCI_UART_REGISTERED, &hu->flags); } int hci_uart_init_ready(struct hci_uart *hu) { if (!test_bit(HCI_UART_INIT_PENDING, &hu->hdev_flags)) return -EALREADY; schedule_work(&hu->init_ready); return 0; } int hci_uart_wait_until_sent(struct hci_uart *hu) { return wait_on_bit_timeout(&hu->tx_state, HCI_UART_SENDING, TASK_INTERRUPTIBLE, msecs_to_jiffies(2000)); } /* ------- Interface to HCI layer ------ */ /* Reset device */ static int hci_uart_flush(struct hci_dev *hdev) { struct hci_uart *hu = hci_get_drvdata(hdev); struct tty_struct *tty = hu->tty; BT_DBG("hdev %p tty %p", hdev, tty); if (hu->tx_skb) { kfree_skb(hu->tx_skb); hu->tx_skb = NULL; } /* Flush any pending characters in the driver and discipline. */ tty_ldisc_flush(tty); tty_driver_flush_buffer(tty); percpu_down_read(&hu->proto_lock); if (test_bit(HCI_UART_PROTO_READY, &hu->flags)) hu->proto->flush(hu); percpu_up_read(&hu->proto_lock); return 0; } /* Initialize device */ static int hci_uart_open(struct hci_dev *hdev) { BT_DBG("%s %p", hdev->name, hdev); /* Undo clearing this from hci_uart_close() */ hdev->flush = hci_uart_flush; return 0; } /* Close device */ static int hci_uart_close(struct hci_dev *hdev) { BT_DBG("hdev %p", hdev); hci_uart_flush(hdev); hdev->flush = NULL; return 0; } /* Send frames from HCI layer */ static int hci_uart_send_frame(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_uart *hu = hci_get_drvdata(hdev); BT_DBG("%s: type %d len %d", hdev->name, hci_skb_pkt_type(skb), skb->len); percpu_down_read(&hu->proto_lock); if (!test_bit(HCI_UART_PROTO_READY, &hu->flags)) { percpu_up_read(&hu->proto_lock); return -EUNATCH; } hu->proto->enqueue(hu, skb); percpu_up_read(&hu->proto_lock); hci_uart_tx_wakeup(hu); return 0; } /* Check the underlying device or tty has flow control support */ bool hci_uart_has_flow_control(struct hci_uart *hu) { /* serdev nodes check if the needed operations are present */ if (hu->serdev) return true; if (hu->tty->driver->ops->tiocmget && hu->tty->driver->ops->tiocmset) return true; return false; } /* Flow control or un-flow control the device */ void hci_uart_set_flow_control(struct hci_uart *hu, bool enable) { struct tty_struct *tty = hu->tty; struct ktermios ktermios; int status; unsigned int set = 0; unsigned int clear = 0; if (hu->serdev) { serdev_device_set_flow_control(hu->serdev, !enable); serdev_device_set_rts(hu->serdev, !enable); return; } if (enable) { /* Disable hardware flow control */ ktermios = tty->termios; ktermios.c_cflag &= ~CRTSCTS; status = tty_set_termios(tty, &ktermios); BT_DBG("Disabling hardware flow control: %s", status ? "failed" : "success"); /* Clear RTS to prevent the device from sending */ /* Most UARTs need OUT2 to enable interrupts */ status = tty->driver->ops->tiocmget(tty); BT_DBG("Current tiocm 0x%x", status); set &= ~(TIOCM_OUT2 | TIOCM_RTS); clear = ~set; set &= TIOCM_DTR | TIOCM_RTS | TIOCM_OUT1 | TIOCM_OUT2 | TIOCM_LOOP; clear &= TIOCM_DTR | TIOCM_RTS | TIOCM_OUT1 | TIOCM_OUT2 | TIOCM_LOOP; status = tty->driver->ops->tiocmset(tty, set, clear); BT_DBG("Clearing RTS: %s", status ? "failed" : "success"); } else { /* Set RTS to allow the device to send again */ status = tty->driver->ops->tiocmget(tty); BT_DBG("Current tiocm 0x%x", status); set |= (TIOCM_OUT2 | TIOCM_RTS); clear = ~set; set &= TIOCM_DTR | TIOCM_RTS | TIOCM_OUT1 | TIOCM_OUT2 | TIOCM_LOOP; clear &= TIOCM_DTR | TIOCM_RTS | TIOCM_OUT1 | TIOCM_OUT2 | TIOCM_LOOP; status = tty->driver->ops->tiocmset(tty, set, clear); BT_DBG("Setting RTS: %s", status ? "failed" : "success"); /* Re-enable hardware flow control */ ktermios = tty->termios; ktermios.c_cflag |= CRTSCTS; status = tty_set_termios(tty, &ktermios); BT_DBG("Enabling hardware flow control: %s", status ? "failed" : "success"); } } void hci_uart_set_speeds(struct hci_uart *hu, unsigned int init_speed, unsigned int oper_speed) { hu->init_speed = init_speed; hu->oper_speed = oper_speed; } void hci_uart_set_baudrate(struct hci_uart *hu, unsigned int speed) { struct tty_struct *tty = hu->tty; struct ktermios ktermios; ktermios = tty->termios; ktermios.c_cflag &= ~CBAUD; tty_termios_encode_baud_rate(&ktermios, speed, speed); /* tty_set_termios() return not checked as it is always 0 */ tty_set_termios(tty, &ktermios); BT_DBG("%s: New tty speeds: %d/%d", hu->hdev->name, tty->termios.c_ispeed, tty->termios.c_ospeed); } static int hci_uart_setup(struct hci_dev *hdev) { struct hci_uart *hu = hci_get_drvdata(hdev); struct hci_rp_read_local_version *ver; struct sk_buff *skb; unsigned int speed; int err; /* Init speed if any */ if (hu->init_speed) speed = hu->init_speed; else if (hu->proto->init_speed) speed = hu->proto->init_speed; else speed = 0; if (speed) hci_uart_set_baudrate(hu, speed); /* Operational speed if any */ if (hu->oper_speed) speed = hu->oper_speed; else if (hu->proto->oper_speed) speed = hu->proto->oper_speed; else speed = 0; if (hu->proto->set_baudrate && speed) { err = hu->proto->set_baudrate(hu, speed); if (!err) hci_uart_set_baudrate(hu, speed); } if (hu->proto->setup) return hu->proto->setup(hu); if (!test_bit(HCI_UART_VND_DETECT, &hu->hdev_flags)) return 0; skb = __hci_cmd_sync(hdev, HCI_OP_READ_LOCAL_VERSION, 0, NULL, HCI_INIT_TIMEOUT); if (IS_ERR(skb)) { BT_ERR("%s: Reading local version information failed (%ld)", hdev->name, PTR_ERR(skb)); return 0; } if (skb->len != sizeof(*ver)) { BT_ERR("%s: Event length mismatch for version information", hdev->name); goto done; } ver = (struct hci_rp_read_local_version *)skb->data; switch (le16_to_cpu(ver->manufacturer)) { #ifdef CONFIG_BT_HCIUART_INTEL case 2: hdev->set_bdaddr = btintel_set_bdaddr; btintel_check_bdaddr(hdev); break; #endif #ifdef CONFIG_BT_HCIUART_BCM case 15: hdev->set_bdaddr = btbcm_set_bdaddr; btbcm_check_bdaddr(hdev); break; #endif default: break; } done: kfree_skb(skb); return 0; } /* ------ LDISC part ------ */ /* hci_uart_tty_open * * Called when line discipline changed to HCI_UART. * * Arguments: * tty pointer to tty info structure * Return Value: * 0 if success, otherwise error code */ static int hci_uart_tty_open(struct tty_struct *tty) { struct hci_uart *hu; BT_DBG("tty %p", tty); if (!capable(CAP_NET_ADMIN)) return -EPERM; /* Error if the tty has no write op instead of leaving an exploitable * hole */ if (tty->ops->write == NULL) return -EOPNOTSUPP; hu = kzalloc(sizeof(struct hci_uart), GFP_KERNEL); if (!hu) { BT_ERR("Can't allocate control structure"); return -ENFILE; } if (percpu_init_rwsem(&hu->proto_lock)) { BT_ERR("Can't allocate semaphore structure"); kfree(hu); return -ENOMEM; } tty->disc_data = hu; hu->tty = tty; tty->receive_room = 65536; /* disable alignment support by default */ hu->alignment = 1; hu->padding = 0; INIT_WORK(&hu->init_ready, hci_uart_init_work); INIT_WORK(&hu->write_work, hci_uart_write_work); /* Flush any pending characters in the driver */ tty_driver_flush_buffer(tty); return 0; } /* hci_uart_tty_close() * * Called when the line discipline is changed to something * else, the tty is closed, or the tty detects a hangup. */ static void hci_uart_tty_close(struct tty_struct *tty) { struct hci_uart *hu = tty->disc_data; struct hci_dev *hdev; BT_DBG("tty %p", tty); /* Detach from the tty */ tty->disc_data = NULL; if (!hu) return; hdev = hu->hdev; if (hdev) hci_uart_close(hdev); if (test_bit(HCI_UART_PROTO_READY, &hu->flags)) { percpu_down_write(&hu->proto_lock); clear_bit(HCI_UART_PROTO_READY, &hu->flags); percpu_up_write(&hu->proto_lock); cancel_work_sync(&hu->init_ready); cancel_work_sync(&hu->write_work); if (hdev) { if (test_bit(HCI_UART_REGISTERED, &hu->flags)) hci_unregister_dev(hdev); hci_free_dev(hdev); } hu->proto->close(hu); } clear_bit(HCI_UART_PROTO_SET, &hu->flags); percpu_free_rwsem(&hu->proto_lock); kfree(hu); } /* hci_uart_tty_wakeup() * * Callback for transmit wakeup. Called when low level * device driver can accept more send data. * * Arguments: tty pointer to associated tty instance data * Return Value: None */ static void hci_uart_tty_wakeup(struct tty_struct *tty) { struct hci_uart *hu = tty->disc_data; BT_DBG(""); if (!hu) return; clear_bit(TTY_DO_WRITE_WAKEUP, &tty->flags); if (tty != hu->tty) return; if (test_bit(HCI_UART_PROTO_READY, &hu->flags)) hci_uart_tx_wakeup(hu); } /* hci_uart_tty_receive() * * Called by tty low level driver when receive data is * available. * * Arguments: tty pointer to tty isntance data * data pointer to received data * flags pointer to flags for data * count count of received data in bytes * * Return Value: None */ static void hci_uart_tty_receive(struct tty_struct *tty, const u8 *data, const char *flags, int count) { struct hci_uart *hu = tty->disc_data; if (!hu || tty != hu->tty) return; percpu_down_read(&hu->proto_lock); if (!test_bit(HCI_UART_PROTO_READY, &hu->flags)) { percpu_up_read(&hu->proto_lock); return; } /* It does not need a lock here as it is already protected by a mutex in * tty caller */ hu->proto->recv(hu, data, count); percpu_up_read(&hu->proto_lock); if (hu->hdev) hu->hdev->stat.byte_rx += count; tty_unthrottle(tty); } static int hci_uart_register_dev(struct hci_uart *hu) { struct hci_dev *hdev; int err; BT_DBG(""); /* Initialize and register HCI device */ hdev = hci_alloc_dev(); if (!hdev) { BT_ERR("Can't allocate HCI device"); return -ENOMEM; } hu->hdev = hdev; hdev->bus = HCI_UART; hci_set_drvdata(hdev, hu); /* Only when vendor specific setup callback is provided, consider * the manufacturer information valid. This avoids filling in the * value for Ericsson when nothing is specified. */ if (hu->proto->setup) hdev->manufacturer = hu->proto->manufacturer; hdev->open = hci_uart_open; hdev->close = hci_uart_close; hdev->flush = hci_uart_flush; hdev->send = hci_uart_send_frame; hdev->setup = hci_uart_setup; SET_HCIDEV_DEV(hdev, hu->tty->dev); if (test_bit(HCI_UART_RAW_DEVICE, &hu->hdev_flags)) set_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks); if (test_bit(HCI_UART_EXT_CONFIG, &hu->hdev_flags)) set_bit(HCI_QUIRK_EXTERNAL_CONFIG, &hdev->quirks); if (!test_bit(HCI_UART_RESET_ON_INIT, &hu->hdev_flags)) set_bit(HCI_QUIRK_RESET_ON_CLOSE, &hdev->quirks); if (test_bit(HCI_UART_CREATE_AMP, &hu->hdev_flags)) hdev->dev_type = HCI_AMP; else hdev->dev_type = HCI_PRIMARY; /* Only call open() for the protocol after hdev is fully initialized as * open() (or a timer/workqueue it starts) may attempt to reference it. */ err = hu->proto->open(hu); if (err) { hu->hdev = NULL; hci_free_dev(hdev); return err; } if (test_bit(HCI_UART_INIT_PENDING, &hu->hdev_flags)) return 0; if (hci_register_dev(hdev) < 0) { BT_ERR("Can't register HCI device"); hu->proto->close(hu); hu->hdev = NULL; hci_free_dev(hdev); return -ENODEV; } set_bit(HCI_UART_REGISTERED, &hu->flags); return 0; } static int hci_uart_set_proto(struct hci_uart *hu, int id) { const struct hci_uart_proto *p; int err; p = hci_uart_get_proto(id); if (!p) return -EPROTONOSUPPORT; hu->proto = p; err = hci_uart_register_dev(hu); if (err) { return err; } set_bit(HCI_UART_PROTO_READY, &hu->flags); return 0; } static int hci_uart_set_flags(struct hci_uart *hu, unsigned long flags) { unsigned long valid_flags = BIT(HCI_UART_RAW_DEVICE) | BIT(HCI_UART_RESET_ON_INIT) | BIT(HCI_UART_CREATE_AMP) | BIT(HCI_UART_INIT_PENDING) | BIT(HCI_UART_EXT_CONFIG) | BIT(HCI_UART_VND_DETECT); if (flags & ~valid_flags) return -EINVAL; hu->hdev_flags = flags; return 0; } /* hci_uart_tty_ioctl() * * Process IOCTL system call for the tty device. * * Arguments: * * tty pointer to tty instance data * cmd IOCTL command code * arg argument for IOCTL call (cmd dependent) * * Return Value: Command dependent */ static int hci_uart_tty_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg) { struct hci_uart *hu = tty->disc_data; int err = 0; BT_DBG(""); /* Verify the status of the device */ if (!hu) return -EBADF; switch (cmd) { case HCIUARTSETPROTO: if (!test_and_set_bit(HCI_UART_PROTO_SET, &hu->flags)) { err = hci_uart_set_proto(hu, arg); if (err) clear_bit(HCI_UART_PROTO_SET, &hu->flags); } else err = -EBUSY; break; case HCIUARTGETPROTO: if (test_bit(HCI_UART_PROTO_SET, &hu->flags) && test_bit(HCI_UART_PROTO_READY, &hu->flags)) err = hu->proto->id; else err = -EUNATCH; break; case HCIUARTGETDEVICE: if (test_bit(HCI_UART_REGISTERED, &hu->flags)) err = hu->hdev->id; else err = -EUNATCH; break; case HCIUARTSETFLAGS: if (test_bit(HCI_UART_PROTO_SET, &hu->flags)) err = -EBUSY; else err = hci_uart_set_flags(hu, arg); break; case HCIUARTGETFLAGS: err = hu->hdev_flags; break; default: err = n_tty_ioctl_helper(tty, cmd, arg); break; } return err; } /* * We don't provide read/write/poll interface for user space. */ static ssize_t hci_uart_tty_read(struct tty_struct *tty, struct file *file, unsigned char *buf, size_t nr, void **cookie, unsigned long offset) { return 0; } static ssize_t hci_uart_tty_write(struct tty_struct *tty, struct file *file, const unsigned char *data, size_t count) { return 0; } static __poll_t hci_uart_tty_poll(struct tty_struct *tty, struct file *filp, poll_table *wait) { return 0; } static struct tty_ldisc_ops hci_uart_ldisc = { .owner = THIS_MODULE, .num = N_HCI, .name = "n_hci", .open = hci_uart_tty_open, .close = hci_uart_tty_close, .read = hci_uart_tty_read, .write = hci_uart_tty_write, .ioctl = hci_uart_tty_ioctl, .compat_ioctl = hci_uart_tty_ioctl, .poll = hci_uart_tty_poll, .receive_buf = hci_uart_tty_receive, .write_wakeup = hci_uart_tty_wakeup, }; static int __init hci_uart_init(void) { int err; BT_INFO("HCI UART driver ver %s", VERSION); /* Register the tty discipline */ err = tty_register_ldisc(&hci_uart_ldisc); if (err) { BT_ERR("HCI line discipline registration failed. (%d)", err); return err; } #ifdef CONFIG_BT_HCIUART_H4 h4_init(); #endif #ifdef CONFIG_BT_HCIUART_BCSP bcsp_init(); #endif #ifdef CONFIG_BT_HCIUART_LL ll_init(); #endif #ifdef CONFIG_BT_HCIUART_ATH3K ath_init(); #endif #ifdef CONFIG_BT_HCIUART_3WIRE h5_init(); #endif #ifdef CONFIG_BT_HCIUART_INTEL intel_init(); #endif #ifdef CONFIG_BT_HCIUART_BCM bcm_init(); #endif #ifdef CONFIG_BT_HCIUART_QCA qca_init(); #endif #ifdef CONFIG_BT_HCIUART_AG6XX ag6xx_init(); #endif #ifdef CONFIG_BT_HCIUART_MRVL mrvl_init(); #endif return 0; } static void __exit hci_uart_exit(void) { #ifdef CONFIG_BT_HCIUART_H4 h4_deinit(); #endif #ifdef CONFIG_BT_HCIUART_BCSP bcsp_deinit(); #endif #ifdef CONFIG_BT_HCIUART_LL ll_deinit(); #endif #ifdef CONFIG_BT_HCIUART_ATH3K ath_deinit(); #endif #ifdef CONFIG_BT_HCIUART_3WIRE h5_deinit(); #endif #ifdef CONFIG_BT_HCIUART_INTEL intel_deinit(); #endif #ifdef CONFIG_BT_HCIUART_BCM bcm_deinit(); #endif #ifdef CONFIG_BT_HCIUART_QCA qca_deinit(); #endif #ifdef CONFIG_BT_HCIUART_AG6XX ag6xx_deinit(); #endif #ifdef CONFIG_BT_HCIUART_MRVL mrvl_deinit(); #endif tty_unregister_ldisc(&hci_uart_ldisc); } module_init(hci_uart_init); module_exit(hci_uart_exit); MODULE_AUTHOR("Marcel Holtmann <marcel@holtmann.org>"); MODULE_DESCRIPTION("Bluetooth HCI UART driver ver " VERSION); MODULE_VERSION(VERSION); MODULE_LICENSE("GPL"); MODULE_ALIAS_LDISC(N_HCI); |
13 550 326 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM sock #if !defined(_TRACE_SOCK_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_SOCK_H #include <net/sock.h> #include <net/ipv6.h> #include <linux/tracepoint.h> #include <linux/ipv6.h> #include <linux/tcp.h> #define family_names \ EM(AF_INET) \ EMe(AF_INET6) /* The protocol traced by inet_sock_set_state */ #define inet_protocol_names \ EM(IPPROTO_TCP) \ EM(IPPROTO_DCCP) \ EM(IPPROTO_SCTP) \ EMe(IPPROTO_MPTCP) #define tcp_state_names \ EM(TCP_ESTABLISHED) \ EM(TCP_SYN_SENT) \ EM(TCP_SYN_RECV) \ EM(TCP_FIN_WAIT1) \ EM(TCP_FIN_WAIT2) \ EM(TCP_TIME_WAIT) \ EM(TCP_CLOSE) \ EM(TCP_CLOSE_WAIT) \ EM(TCP_LAST_ACK) \ EM(TCP_LISTEN) \ EM(TCP_CLOSING) \ EMe(TCP_NEW_SYN_RECV) #define skmem_kind_names \ EM(SK_MEM_SEND) \ EMe(SK_MEM_RECV) /* enums need to be exported to user space */ #undef EM #undef EMe #define EM(a) TRACE_DEFINE_ENUM(a); #define EMe(a) TRACE_DEFINE_ENUM(a); family_names inet_protocol_names tcp_state_names skmem_kind_names #undef EM #undef EMe #define EM(a) { a, #a }, #define EMe(a) { a, #a } #define show_family_name(val) \ __print_symbolic(val, family_names) #define show_inet_protocol_name(val) \ __print_symbolic(val, inet_protocol_names) #define show_tcp_state_name(val) \ __print_symbolic(val, tcp_state_names) #define show_skmem_kind_names(val) \ __print_symbolic(val, skmem_kind_names) TRACE_EVENT(sock_rcvqueue_full, TP_PROTO(struct sock *sk, struct sk_buff *skb), TP_ARGS(sk, skb), TP_STRUCT__entry( __field(int, rmem_alloc) __field(unsigned int, truesize) __field(int, sk_rcvbuf) ), TP_fast_assign( __entry->rmem_alloc = atomic_read(&sk->sk_rmem_alloc); __entry->truesize = skb->truesize; __entry->sk_rcvbuf = READ_ONCE(sk->sk_rcvbuf); ), TP_printk("rmem_alloc=%d truesize=%u sk_rcvbuf=%d", __entry->rmem_alloc, __entry->truesize, __entry->sk_rcvbuf) ); TRACE_EVENT(sock_exceed_buf_limit, TP_PROTO(struct sock *sk, struct proto *prot, long allocated, int kind), TP_ARGS(sk, prot, allocated, kind), TP_STRUCT__entry( __array(char, name, 32) __array(long, sysctl_mem, 3) __field(long, allocated) __field(int, sysctl_rmem) __field(int, rmem_alloc) __field(int, sysctl_wmem) __field(int, wmem_alloc) __field(int, wmem_queued) __field(int, kind) ), TP_fast_assign( strncpy(__entry->name, prot->name, 32); __entry->sysctl_mem[0] = READ_ONCE(prot->sysctl_mem[0]); __entry->sysctl_mem[1] = READ_ONCE(prot->sysctl_mem[1]); __entry->sysctl_mem[2] = READ_ONCE(prot->sysctl_mem[2]); __entry->allocated = allocated; __entry->sysctl_rmem = sk_get_rmem0(sk, prot); __entry->rmem_alloc = atomic_read(&sk->sk_rmem_alloc); __entry->sysctl_wmem = sk_get_wmem0(sk, prot); __entry->wmem_alloc = refcount_read(&sk->sk_wmem_alloc); __entry->wmem_queued = READ_ONCE(sk->sk_wmem_queued); __entry->kind = kind; ), TP_printk("proto:%s sysctl_mem=%ld,%ld,%ld allocated=%ld sysctl_rmem=%d rmem_alloc=%d sysctl_wmem=%d wmem_alloc=%d wmem_queued=%d kind=%s", __entry->name, __entry->sysctl_mem[0], __entry->sysctl_mem[1], __entry->sysctl_mem[2], __entry->allocated, __entry->sysctl_rmem, __entry->rmem_alloc, __entry->sysctl_wmem, __entry->wmem_alloc, __entry->wmem_queued, show_skmem_kind_names(__entry->kind) ) ); TRACE_EVENT(inet_sock_set_state, TP_PROTO(const struct sock *sk, const int oldstate, const int newstate), TP_ARGS(sk, oldstate, newstate), TP_STRUCT__entry( __field(const void *, skaddr) __field(int, oldstate) __field(int, newstate) __field(__u16, sport) __field(__u16, dport) __field(__u16, family) __field(__u16, protocol) __array(__u8, saddr, 4) __array(__u8, daddr, 4) __array(__u8, saddr_v6, 16) __array(__u8, daddr_v6, 16) ), TP_fast_assign( struct inet_sock *inet = inet_sk(sk); struct in6_addr *pin6; __be32 *p32; __entry->skaddr = sk; __entry->oldstate = oldstate; __entry->newstate = newstate; __entry->family = sk->sk_family; __entry->protocol = sk->sk_protocol; __entry->sport = ntohs(inet->inet_sport); __entry->dport = ntohs(inet->inet_dport); p32 = (__be32 *) __entry->saddr; *p32 = inet->inet_saddr; p32 = (__be32 *) __entry->daddr; *p32 = inet->inet_daddr; #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) { pin6 = (struct in6_addr *)__entry->saddr_v6; *pin6 = sk->sk_v6_rcv_saddr; pin6 = (struct in6_addr *)__entry->daddr_v6; *pin6 = sk->sk_v6_daddr; } else #endif { pin6 = (struct in6_addr *)__entry->saddr_v6; ipv6_addr_set_v4mapped(inet->inet_saddr, pin6); pin6 = (struct in6_addr *)__entry->daddr_v6; ipv6_addr_set_v4mapped(inet->inet_daddr, pin6); } ), TP_printk("family=%s protocol=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c oldstate=%s newstate=%s", show_family_name(__entry->family), show_inet_protocol_name(__entry->protocol), __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, __entry->saddr_v6, __entry->daddr_v6, show_tcp_state_name(__entry->oldstate), show_tcp_state_name(__entry->newstate)) ); TRACE_EVENT(inet_sk_error_report, TP_PROTO(const struct sock *sk), TP_ARGS(sk), TP_STRUCT__entry( __field(int, error) __field(__u16, sport) __field(__u16, dport) __field(__u16, family) __field(__u16, protocol) __array(__u8, saddr, 4) __array(__u8, daddr, 4) __array(__u8, saddr_v6, 16) __array(__u8, daddr_v6, 16) ), TP_fast_assign( struct inet_sock *inet = inet_sk(sk); struct in6_addr *pin6; __be32 *p32; __entry->error = sk->sk_err; __entry->family = sk->sk_family; __entry->protocol = sk->sk_protocol; __entry->sport = ntohs(inet->inet_sport); __entry->dport = ntohs(inet->inet_dport); p32 = (__be32 *) __entry->saddr; *p32 = inet->inet_saddr; p32 = (__be32 *) __entry->daddr; *p32 = inet->inet_daddr; #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) { pin6 = (struct in6_addr *)__entry->saddr_v6; *pin6 = sk->sk_v6_rcv_saddr; pin6 = (struct in6_addr *)__entry->daddr_v6; *pin6 = sk->sk_v6_daddr; } else #endif { pin6 = (struct in6_addr *)__entry->saddr_v6; ipv6_addr_set_v4mapped(inet->inet_saddr, pin6); pin6 = (struct in6_addr *)__entry->daddr_v6; ipv6_addr_set_v4mapped(inet->inet_daddr, pin6); } ), TP_printk("family=%s protocol=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c error=%d", show_family_name(__entry->family), show_inet_protocol_name(__entry->protocol), __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, __entry->saddr_v6, __entry->daddr_v6, __entry->error) ); #endif /* _TRACE_SOCK_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
22 3 4 4 2 2 4 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NET_IP6_TUNNEL_H #define _NET_IP6_TUNNEL_H #include <linux/ipv6.h> #include <linux/netdevice.h> #include <linux/if_tunnel.h> #include <linux/ip6_tunnel.h> #include <net/ip_tunnels.h> #include <net/dst_cache.h> #define IP6TUNNEL_ERR_TIMEO (30*HZ) /* capable of sending packets */ #define IP6_TNL_F_CAP_XMIT 0x10000 /* capable of receiving packets */ #define IP6_TNL_F_CAP_RCV 0x20000 /* determine capability on a per-packet basis */ #define IP6_TNL_F_CAP_PER_PACKET 0x40000 struct __ip6_tnl_parm { char name[IFNAMSIZ]; /* name of tunnel device */ int link; /* ifindex of underlying L2 interface */ __u8 proto; /* tunnel protocol */ __u8 encap_limit; /* encapsulation limit for tunnel */ __u8 hop_limit; /* hop limit for tunnel */ bool collect_md; __be32 flowinfo; /* traffic class and flowlabel for tunnel */ __u32 flags; /* tunnel flags */ struct in6_addr laddr; /* local tunnel end-point address */ struct in6_addr raddr; /* remote tunnel end-point address */ __be16 i_flags; __be16 o_flags; __be32 i_key; __be32 o_key; __u32 fwmark; __u32 index; /* ERSPAN type II index */ __u8 erspan_ver; /* ERSPAN version */ __u8 dir; /* direction */ __u16 hwid; /* hwid */ }; /* IPv6 tunnel */ struct ip6_tnl { struct ip6_tnl __rcu *next; /* next tunnel in list */ struct net_device *dev; /* virtual device associated with tunnel */ netdevice_tracker dev_tracker; struct net *net; /* netns for packet i/o */ struct __ip6_tnl_parm parms; /* tunnel configuration parameters */ struct flowi fl; /* flowi template for xmit */ struct dst_cache dst_cache; /* cached dst */ struct gro_cells gro_cells; int err_count; unsigned long err_time; /* These fields used only by GRE */ __u32 i_seqno; /* The last seen seqno */ atomic_t o_seqno; /* The last output seqno */ int hlen; /* tun_hlen + encap_hlen */ int tun_hlen; /* Precalculated header length */ int encap_hlen; /* Encap header length (FOU,GUE) */ struct ip_tunnel_encap encap; int mlink; }; struct ip6_tnl_encap_ops { size_t (*encap_hlen)(struct ip_tunnel_encap *e); int (*build_header)(struct sk_buff *skb, struct ip_tunnel_encap *e, u8 *protocol, struct flowi6 *fl6); int (*err_handler)(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info); }; #ifdef CONFIG_INET extern const struct ip6_tnl_encap_ops __rcu * ip6tun_encaps[MAX_IPTUN_ENCAP_OPS]; int ip6_tnl_encap_add_ops(const struct ip6_tnl_encap_ops *ops, unsigned int num); int ip6_tnl_encap_del_ops(const struct ip6_tnl_encap_ops *ops, unsigned int num); int ip6_tnl_encap_setup(struct ip6_tnl *t, struct ip_tunnel_encap *ipencap); static inline int ip6_encap_hlen(struct ip_tunnel_encap *e) { const struct ip6_tnl_encap_ops *ops; int hlen = -EINVAL; if (e->type == TUNNEL_ENCAP_NONE) return 0; if (e->type >= MAX_IPTUN_ENCAP_OPS) return -EINVAL; rcu_read_lock(); ops = rcu_dereference(ip6tun_encaps[e->type]); if (likely(ops && ops->encap_hlen)) hlen = ops->encap_hlen(e); rcu_read_unlock(); return hlen; } static inline int ip6_tnl_encap(struct sk_buff *skb, struct ip6_tnl *t, u8 *protocol, struct flowi6 *fl6) { const struct ip6_tnl_encap_ops *ops; int ret = -EINVAL; if (t->encap.type == TUNNEL_ENCAP_NONE) return 0; if (t->encap.type >= MAX_IPTUN_ENCAP_OPS) return -EINVAL; rcu_read_lock(); ops = rcu_dereference(ip6tun_encaps[t->encap.type]); if (likely(ops && ops->build_header)) ret = ops->build_header(skb, &t->encap, protocol, fl6); rcu_read_unlock(); return ret; } /* Tunnel encapsulation limit destination sub-option */ struct ipv6_tlv_tnl_enc_lim { __u8 type; /* type-code for option */ __u8 length; /* option length */ __u8 encap_limit; /* tunnel encapsulation limit */ } __packed; int ip6_tnl_rcv_ctl(struct ip6_tnl *t, const struct in6_addr *laddr, const struct in6_addr *raddr); int ip6_tnl_rcv(struct ip6_tnl *tunnel, struct sk_buff *skb, const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst, bool log_ecn_error); int ip6_tnl_xmit_ctl(struct ip6_tnl *t, const struct in6_addr *laddr, const struct in6_addr *raddr); int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, struct flowi6 *fl6, int encap_limit, __u32 *pmtu, __u8 proto); __u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw); __u32 ip6_tnl_get_cap(struct ip6_tnl *t, const struct in6_addr *laddr, const struct in6_addr *raddr); struct net *ip6_tnl_get_link_net(const struct net_device *dev); int ip6_tnl_get_iflink(const struct net_device *dev); int ip6_tnl_change_mtu(struct net_device *dev, int new_mtu); static inline void ip6tunnel_xmit(struct sock *sk, struct sk_buff *skb, struct net_device *dev) { int pkt_len, err; memset(skb->cb, 0, sizeof(struct inet6_skb_parm)); pkt_len = skb->len - skb_inner_network_offset(skb); err = ip6_local_out(dev_net(skb_dst(skb)->dev), sk, skb); if (dev) { if (unlikely(net_xmit_eval(err))) pkt_len = -1; iptunnel_xmit_stats(dev, pkt_len); } } #endif #endif |
4 78 3703 5248 8 1882 3594 320 227 288 254 14 12 271 13 1 14 9 4 2 3 5 1 7 331 327 31 270 45 239 4 910 631 909 909 905 900 910 899 906 910 631 275 900 631 2 2 2 7 8 8 3 3 326 330 6 3 7 3 3 22 16 8 3 4 1 1 1 1 54 5 42 8 8 8 14 14 14 14 13 3 13 14 7 7 1914 1912 910 909 41 28 5 21 3 2 1 1 14 14 224 1 1 221 3 2 1 1 4 4 72 20 7 1 46 1084 1084 1 1080 278 631 518 518 7 881 881 3920 3924 3920 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 | // SPDX-License-Identifier: GPL-2.0-only /* * fs/libfs.c * Library for filesystems writers. */ #include <linux/blkdev.h> #include <linux/export.h> #include <linux/pagemap.h> #include <linux/slab.h> #include <linux/cred.h> #include <linux/mount.h> #include <linux/vfs.h> #include <linux/quotaops.h> #include <linux/mutex.h> #include <linux/namei.h> #include <linux/exportfs.h> #include <linux/iversion.h> #include <linux/writeback.h> #include <linux/buffer_head.h> /* sync_mapping_buffers */ #include <linux/fs_context.h> #include <linux/pseudo_fs.h> #include <linux/fsnotify.h> #include <linux/unicode.h> #include <linux/fscrypt.h> #include <linux/uaccess.h> #include "internal.h" int simple_getattr(struct user_namespace *mnt_userns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); generic_fillattr(&init_user_ns, inode, stat); stat->blocks = inode->i_mapping->nrpages << (PAGE_SHIFT - 9); return 0; } EXPORT_SYMBOL(simple_getattr); int simple_statfs(struct dentry *dentry, struct kstatfs *buf) { buf->f_type = dentry->d_sb->s_magic; buf->f_bsize = PAGE_SIZE; buf->f_namelen = NAME_MAX; return 0; } EXPORT_SYMBOL(simple_statfs); /* * Retaining negative dentries for an in-memory filesystem just wastes * memory and lookup time: arrange for them to be deleted immediately. */ int always_delete_dentry(const struct dentry *dentry) { return 1; } EXPORT_SYMBOL(always_delete_dentry); const struct dentry_operations simple_dentry_operations = { .d_delete = always_delete_dentry, }; EXPORT_SYMBOL(simple_dentry_operations); /* * Lookup the data. This is trivial - if the dentry didn't already * exist, we know it is negative. Set d_op to delete negative dentries. */ struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { if (dentry->d_name.len > NAME_MAX) return ERR_PTR(-ENAMETOOLONG); if (!dentry->d_sb->s_d_op) d_set_d_op(dentry, &simple_dentry_operations); d_add(dentry, NULL); return NULL; } EXPORT_SYMBOL(simple_lookup); int dcache_dir_open(struct inode *inode, struct file *file) { file->private_data = d_alloc_cursor(file->f_path.dentry); return file->private_data ? 0 : -ENOMEM; } EXPORT_SYMBOL(dcache_dir_open); int dcache_dir_close(struct inode *inode, struct file *file) { dput(file->private_data); return 0; } EXPORT_SYMBOL(dcache_dir_close); /* parent is locked at least shared */ /* * Returns an element of siblings' list. * We are looking for <count>th positive after <p>; if * found, dentry is grabbed and returned to caller. * If no such element exists, NULL is returned. */ static struct dentry *scan_positives(struct dentry *cursor, struct list_head *p, loff_t count, struct dentry *last) { struct dentry *dentry = cursor->d_parent, *found = NULL; spin_lock(&dentry->d_lock); while ((p = p->next) != &dentry->d_subdirs) { struct dentry *d = list_entry(p, struct dentry, d_child); // we must at least skip cursors, to avoid livelocks if (d->d_flags & DCACHE_DENTRY_CURSOR) continue; if (simple_positive(d) && !--count) { spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); if (simple_positive(d)) found = dget_dlock(d); spin_unlock(&d->d_lock); if (likely(found)) break; count = 1; } if (need_resched()) { list_move(&cursor->d_child, p); p = &cursor->d_child; spin_unlock(&dentry->d_lock); cond_resched(); spin_lock(&dentry->d_lock); } } spin_unlock(&dentry->d_lock); dput(last); return found; } loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) { struct dentry *dentry = file->f_path.dentry; switch (whence) { case 1: offset += file->f_pos; fallthrough; case 0: if (offset >= 0) break; fallthrough; default: return -EINVAL; } if (offset != file->f_pos) { struct dentry *cursor = file->private_data; struct dentry *to = NULL; inode_lock_shared(dentry->d_inode); if (offset > 2) to = scan_positives(cursor, &dentry->d_subdirs, offset - 2, NULL); spin_lock(&dentry->d_lock); if (to) list_move(&cursor->d_child, &to->d_child); else list_del_init(&cursor->d_child); spin_unlock(&dentry->d_lock); dput(to); file->f_pos = offset; inode_unlock_shared(dentry->d_inode); } return offset; } EXPORT_SYMBOL(dcache_dir_lseek); /* Relationship between i_mode and the DT_xxx types */ static inline unsigned char dt_type(struct inode *inode) { return (inode->i_mode >> 12) & 15; } /* * Directory is locked and all positive dentries in it are safe, since * for ramfs-type trees they can't go away without unlink() or rmdir(), * both impossible due to the lock on directory. */ int dcache_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dentry = file->f_path.dentry; struct dentry *cursor = file->private_data; struct list_head *anchor = &dentry->d_subdirs; struct dentry *next = NULL; struct list_head *p; if (!dir_emit_dots(file, ctx)) return 0; if (ctx->pos == 2) p = anchor; else if (!list_empty(&cursor->d_child)) p = &cursor->d_child; else return 0; while ((next = scan_positives(cursor, p, 1, next)) != NULL) { if (!dir_emit(ctx, next->d_name.name, next->d_name.len, d_inode(next)->i_ino, dt_type(d_inode(next)))) break; ctx->pos++; p = &next->d_child; } spin_lock(&dentry->d_lock); if (next) list_move_tail(&cursor->d_child, &next->d_child); else list_del_init(&cursor->d_child); spin_unlock(&dentry->d_lock); dput(next); return 0; } EXPORT_SYMBOL(dcache_readdir); ssize_t generic_read_dir(struct file *filp, char __user *buf, size_t siz, loff_t *ppos) { return -EISDIR; } EXPORT_SYMBOL(generic_read_dir); const struct file_operations simple_dir_operations = { .open = dcache_dir_open, .release = dcache_dir_close, .llseek = dcache_dir_lseek, .read = generic_read_dir, .iterate_shared = dcache_readdir, .fsync = noop_fsync, }; EXPORT_SYMBOL(simple_dir_operations); const struct inode_operations simple_dir_inode_operations = { .lookup = simple_lookup, }; EXPORT_SYMBOL(simple_dir_inode_operations); static struct dentry *find_next_child(struct dentry *parent, struct dentry *prev) { struct dentry *child = NULL; struct list_head *p = prev ? &prev->d_child : &parent->d_subdirs; spin_lock(&parent->d_lock); while ((p = p->next) != &parent->d_subdirs) { struct dentry *d = container_of(p, struct dentry, d_child); if (simple_positive(d)) { spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); if (simple_positive(d)) child = dget_dlock(d); spin_unlock(&d->d_lock); if (likely(child)) break; } } spin_unlock(&parent->d_lock); dput(prev); return child; } void simple_recursive_removal(struct dentry *dentry, void (*callback)(struct dentry *)) { struct dentry *this = dget(dentry); while (true) { struct dentry *victim = NULL, *child; struct inode *inode = this->d_inode; inode_lock(inode); if (d_is_dir(this)) inode->i_flags |= S_DEAD; while ((child = find_next_child(this, victim)) == NULL) { // kill and ascend // update metadata while it's still locked inode->i_ctime = current_time(inode); clear_nlink(inode); inode_unlock(inode); victim = this; this = this->d_parent; inode = this->d_inode; inode_lock(inode); if (simple_positive(victim)) { d_invalidate(victim); // avoid lost mounts if (d_is_dir(victim)) fsnotify_rmdir(inode, victim); else fsnotify_unlink(inode, victim); if (callback) callback(victim); dput(victim); // unpin it } if (victim == dentry) { inode->i_ctime = inode->i_mtime = current_time(inode); if (d_is_dir(dentry)) drop_nlink(inode); inode_unlock(inode); dput(dentry); return; } } inode_unlock(inode); this = child; } } EXPORT_SYMBOL(simple_recursive_removal); static const struct super_operations simple_super_operations = { .statfs = simple_statfs, }; static int pseudo_fs_fill_super(struct super_block *s, struct fs_context *fc) { struct pseudo_fs_context *ctx = fc->fs_private; struct inode *root; s->s_maxbytes = MAX_LFS_FILESIZE; s->s_blocksize = PAGE_SIZE; s->s_blocksize_bits = PAGE_SHIFT; s->s_magic = ctx->magic; s->s_op = ctx->ops ?: &simple_super_operations; s->s_xattr = ctx->xattr; s->s_time_gran = 1; root = new_inode(s); if (!root) return -ENOMEM; /* * since this is the first inode, make it number 1. New inodes created * after this must take care not to collide with it (by passing * max_reserved of 1 to iunique). */ root->i_ino = 1; root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR; root->i_atime = root->i_mtime = root->i_ctime = current_time(root); s->s_root = d_make_root(root); if (!s->s_root) return -ENOMEM; s->s_d_op = ctx->dops; return 0; } static int pseudo_fs_get_tree(struct fs_context *fc) { return get_tree_nodev(fc, pseudo_fs_fill_super); } static void pseudo_fs_free(struct fs_context *fc) { kfree(fc->fs_private); } static const struct fs_context_operations pseudo_fs_context_ops = { .free = pseudo_fs_free, .get_tree = pseudo_fs_get_tree, }; /* * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that * will never be mountable) */ struct pseudo_fs_context *init_pseudo(struct fs_context *fc, unsigned long magic) { struct pseudo_fs_context *ctx; ctx = kzalloc(sizeof(struct pseudo_fs_context), GFP_KERNEL); if (likely(ctx)) { ctx->magic = magic; fc->fs_private = ctx; fc->ops = &pseudo_fs_context_ops; fc->sb_flags |= SB_NOUSER; fc->global = true; } return ctx; } EXPORT_SYMBOL(init_pseudo); int simple_open(struct inode *inode, struct file *file) { if (inode->i_private) file->private_data = inode->i_private; return 0; } EXPORT_SYMBOL(simple_open); int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(old_dentry); inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); inc_nlink(inode); ihold(inode); dget(dentry); d_instantiate(dentry, inode); return 0; } EXPORT_SYMBOL(simple_link); int simple_empty(struct dentry *dentry) { struct dentry *child; int ret = 0; spin_lock(&dentry->d_lock); list_for_each_entry(child, &dentry->d_subdirs, d_child) { spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED); if (simple_positive(child)) { spin_unlock(&child->d_lock); goto out; } spin_unlock(&child->d_lock); } ret = 1; out: spin_unlock(&dentry->d_lock); return ret; } EXPORT_SYMBOL(simple_empty); int simple_unlink(struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(dentry); inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); drop_nlink(inode); dput(dentry); return 0; } EXPORT_SYMBOL(simple_unlink); int simple_rmdir(struct inode *dir, struct dentry *dentry) { if (!simple_empty(dentry)) return -ENOTEMPTY; drop_nlink(d_inode(dentry)); simple_unlink(dir, dentry); drop_nlink(dir); return 0; } EXPORT_SYMBOL(simple_rmdir); int simple_rename_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { bool old_is_dir = d_is_dir(old_dentry); bool new_is_dir = d_is_dir(new_dentry); if (old_dir != new_dir && old_is_dir != new_is_dir) { if (old_is_dir) { drop_nlink(old_dir); inc_nlink(new_dir); } else { drop_nlink(new_dir); inc_nlink(old_dir); } } old_dir->i_ctime = old_dir->i_mtime = new_dir->i_ctime = new_dir->i_mtime = d_inode(old_dentry)->i_ctime = d_inode(new_dentry)->i_ctime = current_time(old_dir); return 0; } EXPORT_SYMBOL_GPL(simple_rename_exchange); int simple_rename(struct user_namespace *mnt_userns, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { struct inode *inode = d_inode(old_dentry); int they_are_dirs = d_is_dir(old_dentry); if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) return -EINVAL; if (flags & RENAME_EXCHANGE) return simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry); if (!simple_empty(new_dentry)) return -ENOTEMPTY; if (d_really_is_positive(new_dentry)) { simple_unlink(new_dir, new_dentry); if (they_are_dirs) { drop_nlink(d_inode(new_dentry)); drop_nlink(old_dir); } } else if (they_are_dirs) { drop_nlink(old_dir); inc_nlink(new_dir); } old_dir->i_ctime = old_dir->i_mtime = new_dir->i_ctime = new_dir->i_mtime = inode->i_ctime = current_time(old_dir); return 0; } EXPORT_SYMBOL(simple_rename); /** * simple_setattr - setattr for simple filesystem * @mnt_userns: user namespace of the target mount * @dentry: dentry * @iattr: iattr structure * * Returns 0 on success, -error on failure. * * simple_setattr is a simple ->setattr implementation without a proper * implementation of size changes. * * It can either be used for in-memory filesystems or special files * on simple regular filesystems. Anything that needs to change on-disk * or wire state on size changes needs its own setattr method. */ int simple_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, struct iattr *iattr) { struct inode *inode = d_inode(dentry); int error; error = setattr_prepare(mnt_userns, dentry, iattr); if (error) return error; if (iattr->ia_valid & ATTR_SIZE) truncate_setsize(inode, iattr->ia_size); setattr_copy(mnt_userns, inode, iattr); mark_inode_dirty(inode); return 0; } EXPORT_SYMBOL(simple_setattr); static int simple_read_folio(struct file *file, struct folio *folio) { folio_zero_range(folio, 0, folio_size(folio)); flush_dcache_folio(folio); folio_mark_uptodate(folio); folio_unlock(folio); return 0; } int simple_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, struct page **pagep, void **fsdata) { struct page *page; pgoff_t index; index = pos >> PAGE_SHIFT; page = grab_cache_page_write_begin(mapping, index); if (!page) return -ENOMEM; *pagep = page; if (!PageUptodate(page) && (len != PAGE_SIZE)) { unsigned from = pos & (PAGE_SIZE - 1); zero_user_segments(page, 0, from, from + len, PAGE_SIZE); } return 0; } EXPORT_SYMBOL(simple_write_begin); /** * simple_write_end - .write_end helper for non-block-device FSes * @file: See .write_end of address_space_operations * @mapping: " * @pos: " * @len: " * @copied: " * @page: " * @fsdata: " * * simple_write_end does the minimum needed for updating a page after writing is * done. It has the same API signature as the .write_end of * address_space_operations vector. So it can just be set onto .write_end for * FSes that don't need any other processing. i_mutex is assumed to be held. * Block based filesystems should use generic_write_end(). * NOTE: Even though i_size might get updated by this function, mark_inode_dirty * is not called, so a filesystem that actually does store data in .write_inode * should extend on what's done here with a call to mark_inode_dirty() in the * case that i_size has changed. * * Use *ONLY* with simple_read_folio() */ static int simple_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { struct inode *inode = page->mapping->host; loff_t last_pos = pos + copied; /* zero the stale part of the page if we did a short copy */ if (!PageUptodate(page)) { if (copied < len) { unsigned from = pos & (PAGE_SIZE - 1); zero_user(page, from + copied, len - copied); } SetPageUptodate(page); } /* * No need to use i_size_read() here, the i_size * cannot change under us because we hold the i_mutex. */ if (last_pos > inode->i_size) i_size_write(inode, last_pos); set_page_dirty(page); unlock_page(page); put_page(page); return copied; } /* * Provides ramfs-style behavior: data in the pagecache, but no writeback. */ const struct address_space_operations ram_aops = { .read_folio = simple_read_folio, .write_begin = simple_write_begin, .write_end = simple_write_end, .dirty_folio = noop_dirty_folio, }; EXPORT_SYMBOL(ram_aops); /* * the inodes created here are not hashed. If you use iunique to generate * unique inode values later for this filesystem, then you must take care * to pass it an appropriate max_reserved value to avoid collisions. */ int simple_fill_super(struct super_block *s, unsigned long magic, const struct tree_descr *files) { struct inode *inode; struct dentry *root; struct dentry *dentry; int i; s->s_blocksize = PAGE_SIZE; s->s_blocksize_bits = PAGE_SHIFT; s->s_magic = magic; s->s_op = &simple_super_operations; s->s_time_gran = 1; inode = new_inode(s); if (!inode) return -ENOMEM; /* * because the root inode is 1, the files array must not contain an * entry at index 1 */ inode->i_ino = 1; inode->i_mode = S_IFDIR | 0755; inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; set_nlink(inode, 2); root = d_make_root(inode); if (!root) return -ENOMEM; for (i = 0; !files->name || files->name[0]; i++, files++) { if (!files->name) continue; /* warn if it tries to conflict with the root inode */ if (unlikely(i == 1)) printk(KERN_WARNING "%s: %s passed in a files array" "with an index of 1!\n", __func__, s->s_type->name); dentry = d_alloc_name(root, files->name); if (!dentry) goto out; inode = new_inode(s); if (!inode) { dput(dentry); goto out; } inode->i_mode = S_IFREG | files->mode; inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); inode->i_fop = files->ops; inode->i_ino = i; d_add(dentry, inode); } s->s_root = root; return 0; out: d_genocide(root); shrink_dcache_parent(root); dput(root); return -ENOMEM; } EXPORT_SYMBOL(simple_fill_super); static DEFINE_SPINLOCK(pin_fs_lock); int simple_pin_fs(struct file_system_type *type, struct vfsmount **mount, int *count) { struct vfsmount *mnt = NULL; spin_lock(&pin_fs_lock); if (unlikely(!*mount)) { spin_unlock(&pin_fs_lock); mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, NULL); if (IS_ERR(mnt)) return PTR_ERR(mnt); spin_lock(&pin_fs_lock); if (!*mount) *mount = mnt; } mntget(*mount); ++*count; spin_unlock(&pin_fs_lock); mntput(mnt); return 0; } EXPORT_SYMBOL(simple_pin_fs); void simple_release_fs(struct vfsmount **mount, int *count) { struct vfsmount *mnt; spin_lock(&pin_fs_lock); mnt = *mount; if (!--*count) *mount = NULL; spin_unlock(&pin_fs_lock); mntput(mnt); } EXPORT_SYMBOL(simple_release_fs); /** * simple_read_from_buffer - copy data from the buffer to user space * @to: the user space buffer to read to * @count: the maximum number of bytes to read * @ppos: the current position in the buffer * @from: the buffer to read from * @available: the size of the buffer * * The simple_read_from_buffer() function reads up to @count bytes from the * buffer @from at offset @ppos into the user space address starting at @to. * * On success, the number of bytes read is returned and the offset @ppos is * advanced by this number, or negative value is returned on error. **/ ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos, const void *from, size_t available) { loff_t pos = *ppos; size_t ret; if (pos < 0) return -EINVAL; if (pos >= available || !count) return 0; if (count > available - pos) count = available - pos; ret = copy_to_user(to, from + pos, count); if (ret == count) return -EFAULT; count -= ret; *ppos = pos + count; return count; } EXPORT_SYMBOL(simple_read_from_buffer); /** * simple_write_to_buffer - copy data from user space to the buffer * @to: the buffer to write to * @available: the size of the buffer * @ppos: the current position in the buffer * @from: the user space buffer to read from * @count: the maximum number of bytes to read * * The simple_write_to_buffer() function reads up to @count bytes from the user * space address starting at @from into the buffer @to at offset @ppos. * * On success, the number of bytes written is returned and the offset @ppos is * advanced by this number, or negative value is returned on error. **/ ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos, const void __user *from, size_t count) { loff_t pos = *ppos; size_t res; if (pos < 0) return -EINVAL; if (pos >= available || !count) return 0; if (count > available - pos) count = available - pos; res = copy_from_user(to + pos, from, count); if (res == count) return -EFAULT; count -= res; *ppos = pos + count; return count; } EXPORT_SYMBOL(simple_write_to_buffer); /** * memory_read_from_buffer - copy data from the buffer * @to: the kernel space buffer to read to * @count: the maximum number of bytes to read * @ppos: the current position in the buffer * @from: the buffer to read from * @available: the size of the buffer * * The memory_read_from_buffer() function reads up to @count bytes from the * buffer @from at offset @ppos into the kernel space address starting at @to. * * On success, the number of bytes read is returned and the offset @ppos is * advanced by this number, or negative value is returned on error. **/ ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos, const void *from, size_t available) { loff_t pos = *ppos; if (pos < 0) return -EINVAL; if (pos >= available) return 0; if (count > available - pos) count = available - pos; memcpy(to, from + pos, count); *ppos = pos + count; return count; } EXPORT_SYMBOL(memory_read_from_buffer); /* * Transaction based IO. * The file expects a single write which triggers the transaction, and then * possibly a read which collects the result - which is stored in a * file-local buffer. */ void simple_transaction_set(struct file *file, size_t n) { struct simple_transaction_argresp *ar = file->private_data; BUG_ON(n > SIMPLE_TRANSACTION_LIMIT); /* * The barrier ensures that ar->size will really remain zero until * ar->data is ready for reading. */ smp_mb(); ar->size = n; } EXPORT_SYMBOL(simple_transaction_set); char *simple_transaction_get(struct file *file, const char __user *buf, size_t size) { struct simple_transaction_argresp *ar; static DEFINE_SPINLOCK(simple_transaction_lock); if (size > SIMPLE_TRANSACTION_LIMIT - 1) return ERR_PTR(-EFBIG); ar = (struct simple_transaction_argresp *)get_zeroed_page(GFP_KERNEL); if (!ar) return ERR_PTR(-ENOMEM); spin_lock(&simple_transaction_lock); /* only one write allowed per open */ if (file->private_data) { spin_unlock(&simple_transaction_lock); free_page((unsigned long)ar); return ERR_PTR(-EBUSY); } file->private_data = ar; spin_unlock(&simple_transaction_lock); if (copy_from_user(ar->data, buf, size)) return ERR_PTR(-EFAULT); return ar->data; } EXPORT_SYMBOL(simple_transaction_get); ssize_t simple_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos) { struct simple_transaction_argresp *ar = file->private_data; if (!ar) return 0; return simple_read_from_buffer(buf, size, pos, ar->data, ar->size); } EXPORT_SYMBOL(simple_transaction_read); int simple_transaction_release(struct inode *inode, struct file *file) { free_page((unsigned long)file->private_data); return 0; } EXPORT_SYMBOL(simple_transaction_release); /* Simple attribute files */ struct simple_attr { int (*get)(void *, u64 *); int (*set)(void *, u64); char get_buf[24]; /* enough to store a u64 and "\n\0" */ char set_buf[24]; void *data; const char *fmt; /* format for read operation */ struct mutex mutex; /* protects access to these buffers */ }; /* simple_attr_open is called by an actual attribute open file operation * to set the attribute specific access operations. */ int simple_attr_open(struct inode *inode, struct file *file, int (*get)(void *, u64 *), int (*set)(void *, u64), const char *fmt) { struct simple_attr *attr; attr = kzalloc(sizeof(*attr), GFP_KERNEL); if (!attr) return -ENOMEM; attr->get = get; attr->set = set; attr->data = inode->i_private; attr->fmt = fmt; mutex_init(&attr->mutex); file->private_data = attr; return nonseekable_open(inode, file); } EXPORT_SYMBOL_GPL(simple_attr_open); int simple_attr_release(struct inode *inode, struct file *file) { kfree(file->private_data); return 0; } EXPORT_SYMBOL_GPL(simple_attr_release); /* GPL-only? This? Really? */ /* read from the buffer that is filled with the get function */ ssize_t simple_attr_read(struct file *file, char __user *buf, size_t len, loff_t *ppos) { struct simple_attr *attr; size_t size; ssize_t ret; attr = file->private_data; if (!attr->get) return -EACCES; ret = mutex_lock_interruptible(&attr->mutex); if (ret) return ret; if (*ppos && attr->get_buf[0]) { /* continued read */ size = strlen(attr->get_buf); } else { /* first read */ u64 val; ret = attr->get(attr->data, &val); if (ret) goto out; size = scnprintf(attr->get_buf, sizeof(attr->get_buf), attr->fmt, (unsigned long long)val); } ret = simple_read_from_buffer(buf, len, ppos, attr->get_buf, size); out: mutex_unlock(&attr->mutex); return ret; } EXPORT_SYMBOL_GPL(simple_attr_read); /* interpret the buffer as a number to call the set function with */ static ssize_t simple_attr_write_xsigned(struct file *file, const char __user *buf, size_t len, loff_t *ppos, bool is_signed) { struct simple_attr *attr; unsigned long long val; size_t size; ssize_t ret; attr = file->private_data; if (!attr->set) return -EACCES; ret = mutex_lock_interruptible(&attr->mutex); if (ret) return ret; ret = -EFAULT; size = min(sizeof(attr->set_buf) - 1, len); if (copy_from_user(attr->set_buf, buf, size)) goto out; attr->set_buf[size] = '\0'; if (is_signed) ret = kstrtoll(attr->set_buf, 0, &val); else ret = kstrtoull(attr->set_buf, 0, &val); if (ret) goto out; ret = attr->set(attr->data, val); if (ret == 0) ret = len; /* on success, claim we got the whole input */ out: mutex_unlock(&attr->mutex); return ret; } ssize_t simple_attr_write(struct file *file, const char __user *buf, size_t len, loff_t *ppos) { return simple_attr_write_xsigned(file, buf, len, ppos, false); } EXPORT_SYMBOL_GPL(simple_attr_write); ssize_t simple_attr_write_signed(struct file *file, const char __user *buf, size_t len, loff_t *ppos) { return simple_attr_write_xsigned(file, buf, len, ppos, true); } EXPORT_SYMBOL_GPL(simple_attr_write_signed); /** * generic_fh_to_dentry - generic helper for the fh_to_dentry export operation * @sb: filesystem to do the file handle conversion on * @fid: file handle to convert * @fh_len: length of the file handle in bytes * @fh_type: type of file handle * @get_inode: filesystem callback to retrieve inode * * This function decodes @fid as long as it has one of the well-known * Linux filehandle types and calls @get_inode on it to retrieve the * inode for the object specified in the file handle. */ struct dentry *generic_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type, struct inode *(*get_inode) (struct super_block *sb, u64 ino, u32 gen)) { struct inode *inode = NULL; if (fh_len < 2) return NULL; switch (fh_type) { case FILEID_INO32_GEN: case FILEID_INO32_GEN_PARENT: inode = get_inode(sb, fid->i32.ino, fid->i32.gen); break; } return d_obtain_alias(inode); } EXPORT_SYMBOL_GPL(generic_fh_to_dentry); /** * generic_fh_to_parent - generic helper for the fh_to_parent export operation * @sb: filesystem to do the file handle conversion on * @fid: file handle to convert * @fh_len: length of the file handle in bytes * @fh_type: type of file handle * @get_inode: filesystem callback to retrieve inode * * This function decodes @fid as long as it has one of the well-known * Linux filehandle types and calls @get_inode on it to retrieve the * inode for the _parent_ object specified in the file handle if it * is specified in the file handle, or NULL otherwise. */ struct dentry *generic_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type, struct inode *(*get_inode) (struct super_block *sb, u64 ino, u32 gen)) { struct inode *inode = NULL; if (fh_len <= 2) return NULL; switch (fh_type) { case FILEID_INO32_GEN_PARENT: inode = get_inode(sb, fid->i32.parent_ino, (fh_len > 3 ? fid->i32.parent_gen : 0)); break; } return d_obtain_alias(inode); } EXPORT_SYMBOL_GPL(generic_fh_to_parent); /** * __generic_file_fsync - generic fsync implementation for simple filesystems * * @file: file to synchronize * @start: start offset in bytes * @end: end offset in bytes (inclusive) * @datasync: only synchronize essential metadata if true * * This is a generic implementation of the fsync method for simple * filesystems which track all non-inode metadata in the buffers list * hanging off the address_space structure. */ int __generic_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; int err; int ret; err = file_write_and_wait_range(file, start, end); if (err) return err; inode_lock(inode); ret = sync_mapping_buffers(inode->i_mapping); if (!(inode->i_state & I_DIRTY_ALL)) goto out; if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) goto out; err = sync_inode_metadata(inode, 1); if (ret == 0) ret = err; out: inode_unlock(inode); /* check and advance again to catch errors after syncing out buffers */ err = file_check_and_advance_wb_err(file); if (ret == 0) ret = err; return ret; } EXPORT_SYMBOL(__generic_file_fsync); /** * generic_file_fsync - generic fsync implementation for simple filesystems * with flush * @file: file to synchronize * @start: start offset in bytes * @end: end offset in bytes (inclusive) * @datasync: only synchronize essential metadata if true * */ int generic_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; int err; err = __generic_file_fsync(file, start, end, datasync); if (err) return err; return blkdev_issue_flush(inode->i_sb->s_bdev); } EXPORT_SYMBOL(generic_file_fsync); /** * generic_check_addressable - Check addressability of file system * @blocksize_bits: log of file system block size * @num_blocks: number of blocks in file system * * Determine whether a file system with @num_blocks blocks (and a * block size of 2**@blocksize_bits) is addressable by the sector_t * and page cache of the system. Return 0 if so and -EFBIG otherwise. */ int generic_check_addressable(unsigned blocksize_bits, u64 num_blocks) { u64 last_fs_block = num_blocks - 1; u64 last_fs_page = last_fs_block >> (PAGE_SHIFT - blocksize_bits); if (unlikely(num_blocks == 0)) return 0; if ((blocksize_bits < 9) || (blocksize_bits > PAGE_SHIFT)) return -EINVAL; if ((last_fs_block > (sector_t)(~0ULL) >> (blocksize_bits - 9)) || (last_fs_page > (pgoff_t)(~0ULL))) { return -EFBIG; } return 0; } EXPORT_SYMBOL(generic_check_addressable); /* * No-op implementation of ->fsync for in-memory filesystems. */ int noop_fsync(struct file *file, loff_t start, loff_t end, int datasync) { return 0; } EXPORT_SYMBOL(noop_fsync); ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { /* * iomap based filesystems support direct I/O without need for * this callback. However, it still needs to be set in * inode->a_ops so that open/fcntl know that direct I/O is * generally supported. */ return -EINVAL; } EXPORT_SYMBOL_GPL(noop_direct_IO); /* Because kfree isn't assignment-compatible with void(void*) ;-/ */ void kfree_link(void *p) { kfree(p); } EXPORT_SYMBOL(kfree_link); struct inode *alloc_anon_inode(struct super_block *s) { static const struct address_space_operations anon_aops = { .dirty_folio = noop_dirty_folio, }; struct inode *inode = new_inode_pseudo(s); if (!inode) return ERR_PTR(-ENOMEM); inode->i_ino = get_next_ino(); inode->i_mapping->a_ops = &anon_aops; /* * Mark the inode dirty from the very beginning, * that way it will never be moved to the dirty * list because mark_inode_dirty() will think * that it already _is_ on the dirty list. */ inode->i_state = I_DIRTY; inode->i_mode = S_IRUSR | S_IWUSR; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); inode->i_flags |= S_PRIVATE; inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); return inode; } EXPORT_SYMBOL(alloc_anon_inode); /** * simple_nosetlease - generic helper for prohibiting leases * @filp: file pointer * @arg: type of lease to obtain * @flp: new lease supplied for insertion * @priv: private data for lm_setup operation * * Generic helper for filesystems that do not wish to allow leases to be set. * All arguments are ignored and it just returns -EINVAL. */ int simple_nosetlease(struct file *filp, long arg, struct file_lock **flp, void **priv) { return -EINVAL; } EXPORT_SYMBOL(simple_nosetlease); /** * simple_get_link - generic helper to get the target of "fast" symlinks * @dentry: not used here * @inode: the symlink inode * @done: not used here * * Generic helper for filesystems to use for symlink inodes where a pointer to * the symlink target is stored in ->i_link. NOTE: this isn't normally called, * since as an optimization the path lookup code uses any non-NULL ->i_link * directly, without calling ->get_link(). But ->get_link() still must be set, * to mark the inode_operations as being for a symlink. * * Return: the symlink target */ const char *simple_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { return inode->i_link; } EXPORT_SYMBOL(simple_get_link); const struct inode_operations simple_symlink_inode_operations = { .get_link = simple_get_link, }; EXPORT_SYMBOL(simple_symlink_inode_operations); /* * Operations for a permanently empty directory. */ static struct dentry *empty_dir_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { return ERR_PTR(-ENOENT); } static int empty_dir_getattr(struct user_namespace *mnt_userns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); generic_fillattr(&init_user_ns, inode, stat); return 0; } static int empty_dir_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, struct iattr *attr) { return -EPERM; } static ssize_t empty_dir_listxattr(struct dentry *dentry, char *list, size_t size) { return -EOPNOTSUPP; } static const struct inode_operations empty_dir_inode_operations = { .lookup = empty_dir_lookup, .permission = generic_permission, .setattr = empty_dir_setattr, .getattr = empty_dir_getattr, .listxattr = empty_dir_listxattr, }; static loff_t empty_dir_llseek(struct file *file, loff_t offset, int whence) { /* An empty directory has two entries . and .. at offsets 0 and 1 */ return generic_file_llseek_size(file, offset, whence, 2, 2); } static int empty_dir_readdir(struct file *file, struct dir_context *ctx) { dir_emit_dots(file, ctx); return 0; } static const struct file_operations empty_dir_operations = { .llseek = empty_dir_llseek, .read = generic_read_dir, .iterate_shared = empty_dir_readdir, .fsync = noop_fsync, }; void make_empty_dir_inode(struct inode *inode) { set_nlink(inode, 2); inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; inode->i_uid = GLOBAL_ROOT_UID; inode->i_gid = GLOBAL_ROOT_GID; inode->i_rdev = 0; inode->i_size = 0; inode->i_blkbits = PAGE_SHIFT; inode->i_blocks = 0; inode->i_op = &empty_dir_inode_operations; inode->i_opflags &= ~IOP_XATTR; inode->i_fop = &empty_dir_operations; } bool is_empty_dir_inode(struct inode *inode) { return (inode->i_fop == &empty_dir_operations) && (inode->i_op == &empty_dir_inode_operations); } #if IS_ENABLED(CONFIG_UNICODE) /* * Determine if the name of a dentry should be casefolded. * * Return: if names will need casefolding */ static bool needs_casefold(const struct inode *dir) { return IS_CASEFOLDED(dir) && dir->i_sb->s_encoding; } /** * generic_ci_d_compare - generic d_compare implementation for casefolding filesystems * @dentry: dentry whose name we are checking against * @len: len of name of dentry * @str: str pointer to name of dentry * @name: Name to compare against * * Return: 0 if names match, 1 if mismatch, or -ERRNO */ static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { const struct dentry *parent = READ_ONCE(dentry->d_parent); const struct inode *dir = READ_ONCE(parent->d_inode); const struct super_block *sb = dentry->d_sb; const struct unicode_map *um = sb->s_encoding; struct qstr qstr = QSTR_INIT(str, len); char strbuf[DNAME_INLINE_LEN]; int ret; if (!dir || !needs_casefold(dir)) goto fallback; /* * If the dentry name is stored in-line, then it may be concurrently * modified by a rename. If this happens, the VFS will eventually retry * the lookup, so it doesn't matter what ->d_compare() returns. * However, it's unsafe to call utf8_strncasecmp() with an unstable * string. Therefore, we have to copy the name into a temporary buffer. */ if (len <= DNAME_INLINE_LEN - 1) { memcpy(strbuf, str, len); strbuf[len] = 0; qstr.name = strbuf; /* prevent compiler from optimizing out the temporary buffer */ barrier(); } ret = utf8_strncasecmp(um, name, &qstr); if (ret >= 0) return ret; if (sb_has_strict_encoding(sb)) return -EINVAL; fallback: if (len != name->len) return 1; return !!memcmp(str, name->name, len); } /** * generic_ci_d_hash - generic d_hash implementation for casefolding filesystems * @dentry: dentry of the parent directory * @str: qstr of name whose hash we should fill in * * Return: 0 if hash was successful or unchanged, and -EINVAL on error */ static int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str) { const struct inode *dir = READ_ONCE(dentry->d_inode); struct super_block *sb = dentry->d_sb; const struct unicode_map *um = sb->s_encoding; int ret = 0; if (!dir || !needs_casefold(dir)) return 0; ret = utf8_casefold_hash(um, dentry, str); if (ret < 0 && sb_has_strict_encoding(sb)) return -EINVAL; return 0; } static const struct dentry_operations generic_ci_dentry_ops = { .d_hash = generic_ci_d_hash, .d_compare = generic_ci_d_compare, }; #endif #ifdef CONFIG_FS_ENCRYPTION static const struct dentry_operations generic_encrypted_dentry_ops = { .d_revalidate = fscrypt_d_revalidate, }; #endif #if defined(CONFIG_FS_ENCRYPTION) && IS_ENABLED(CONFIG_UNICODE) static const struct dentry_operations generic_encrypted_ci_dentry_ops = { .d_hash = generic_ci_d_hash, .d_compare = generic_ci_d_compare, .d_revalidate = fscrypt_d_revalidate, }; #endif /** * generic_set_encrypted_ci_d_ops - helper for setting d_ops for given dentry * @dentry: dentry to set ops on * * Casefolded directories need d_hash and d_compare set, so that the dentries * contained in them are handled case-insensitively. Note that these operations * are needed on the parent directory rather than on the dentries in it, and * while the casefolding flag can be toggled on and off on an empty directory, * dentry_operations can't be changed later. As a result, if the filesystem has * casefolding support enabled at all, we have to give all dentries the * casefolding operations even if their inode doesn't have the casefolding flag * currently (and thus the casefolding ops would be no-ops for now). * * Encryption works differently in that the only dentry operation it needs is * d_revalidate, which it only needs on dentries that have the no-key name flag. * The no-key flag can't be set "later", so we don't have to worry about that. * * Finally, to maximize compatibility with overlayfs (which isn't compatible * with certain dentry operations) and to avoid taking an unnecessary * performance hit, we use custom dentry_operations for each possible * combination rather than always installing all operations. */ void generic_set_encrypted_ci_d_ops(struct dentry *dentry) { #ifdef CONFIG_FS_ENCRYPTION bool needs_encrypt_ops = dentry->d_flags & DCACHE_NOKEY_NAME; #endif #if IS_ENABLED(CONFIG_UNICODE) bool needs_ci_ops = dentry->d_sb->s_encoding; #endif #if defined(CONFIG_FS_ENCRYPTION) && IS_ENABLED(CONFIG_UNICODE) if (needs_encrypt_ops && needs_ci_ops) { d_set_d_op(dentry, &generic_encrypted_ci_dentry_ops); return; } #endif #ifdef CONFIG_FS_ENCRYPTION if (needs_encrypt_ops) { d_set_d_op(dentry, &generic_encrypted_dentry_ops); return; } #endif #if IS_ENABLED(CONFIG_UNICODE) if (needs_ci_ops) { d_set_d_op(dentry, &generic_ci_dentry_ops); return; } #endif } EXPORT_SYMBOL(generic_set_encrypted_ci_d_ops); /** * inode_maybe_inc_iversion - increments i_version * @inode: inode with the i_version that should be updated * @force: increment the counter even if it's not necessary? * * Every time the inode is modified, the i_version field must be seen to have * changed by any observer. * * If "force" is set or the QUERIED flag is set, then ensure that we increment * the value, and clear the queried flag. * * In the common case where neither is set, then we can return "false" without * updating i_version. * * If this function returns false, and no other metadata has changed, then we * can avoid logging the metadata. */ bool inode_maybe_inc_iversion(struct inode *inode, bool force) { u64 cur, new; /* * The i_version field is not strictly ordered with any other inode * information, but the legacy inode_inc_iversion code used a spinlock * to serialize increments. * * Here, we add full memory barriers to ensure that any de-facto * ordering with other info is preserved. * * This barrier pairs with the barrier in inode_query_iversion() */ smp_mb(); cur = inode_peek_iversion_raw(inode); do { /* If flag is clear then we needn't do anything */ if (!force && !(cur & I_VERSION_QUERIED)) return false; /* Since lowest bit is flag, add 2 to avoid it */ new = (cur & ~I_VERSION_QUERIED) + I_VERSION_INCREMENT; } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new)); return true; } EXPORT_SYMBOL(inode_maybe_inc_iversion); |
1644 58 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __SHMEM_FS_H #define __SHMEM_FS_H #include <linux/file.h> #include <linux/swap.h> #include <linux/mempolicy.h> #include <linux/pagemap.h> #include <linux/percpu_counter.h> #include <linux/xattr.h> #include <linux/fs_parser.h> #include <linux/android_vendor.h> /* inode in-kernel data */ struct shmem_inode_info { spinlock_t lock; unsigned int seals; /* shmem seals */ unsigned long flags; unsigned long alloced; /* data pages alloced to file */ unsigned long swapped; /* subtotal assigned to swap */ pgoff_t fallocend; /* highest fallocate endindex */ struct list_head shrinklist; /* shrinkable hpage inodes */ struct list_head swaplist; /* chain of maybes on swap */ struct shared_policy policy; /* NUMA memory alloc policy */ struct simple_xattrs xattrs; /* list of xattrs */ atomic_t stop_eviction; /* hold when working on inode */ struct timespec64 i_crtime; /* file creation time */ unsigned int fsflags; /* flags for FS_IOC_[SG]ETFLAGS */ struct inode vfs_inode; ANDROID_VENDOR_DATA(1); }; #define SHMEM_FL_USER_VISIBLE FS_FL_USER_VISIBLE #define SHMEM_FL_USER_MODIFIABLE \ (FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NODUMP_FL | FS_NOATIME_FL) #define SHMEM_FL_INHERITED (FS_NODUMP_FL | FS_NOATIME_FL) struct shmem_sb_info { unsigned long max_blocks; /* How many blocks are allowed */ struct percpu_counter used_blocks; /* How many are allocated */ unsigned long max_inodes; /* How many inodes are allowed */ unsigned long free_inodes; /* How many are left for allocation */ raw_spinlock_t stat_lock; /* Serialize shmem_sb_info changes */ umode_t mode; /* Mount mode for root directory */ unsigned char huge; /* Whether to try for hugepages */ kuid_t uid; /* Mount uid for root directory */ kgid_t gid; /* Mount gid for root directory */ bool full_inums; /* If i_ino should be uint or ino_t */ ino_t next_ino; /* The next per-sb inode number to use */ ino_t __percpu *ino_batch; /* The next per-cpu inode number to use */ struct mempolicy *mpol; /* default memory policy for mappings */ spinlock_t shrinklist_lock; /* Protects shrinklist */ struct list_head shrinklist; /* List of shinkable inodes */ unsigned long shrinklist_len; /* Length of shrinklist */ }; static inline struct shmem_inode_info *SHMEM_I(struct inode *inode) { return container_of(inode, struct shmem_inode_info, vfs_inode); } /* * Functions in mm/shmem.c called directly from elsewhere: */ extern const struct fs_parameter_spec shmem_fs_parameters[]; extern void shmem_init(void); extern int shmem_init_fs_context(struct fs_context *fc); extern struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags); extern struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags); extern struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name, loff_t size, unsigned long flags); extern int shmem_zero_setup(struct vm_area_struct *); extern unsigned long shmem_get_unmapped_area(struct file *, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); extern int shmem_lock(struct file *file, int lock, struct ucounts *ucounts); #ifdef CONFIG_SHMEM extern const struct address_space_operations shmem_aops; static inline bool shmem_mapping(struct address_space *mapping) { return mapping->a_ops == &shmem_aops; } #else static inline bool shmem_mapping(struct address_space *mapping) { return false; } #endif /* CONFIG_SHMEM */ extern void shmem_unlock_mapping(struct address_space *mapping); extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp_mask); extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end); int shmem_unuse(unsigned int type); extern bool shmem_is_huge(struct vm_area_struct *vma, struct inode *inode, pgoff_t index, bool shmem_huge_force); static inline bool shmem_huge_enabled(struct vm_area_struct *vma, bool shmem_huge_force) { return shmem_is_huge(vma, file_inode(vma->vm_file), vma->vm_pgoff, shmem_huge_force); } extern unsigned long shmem_swap_usage(struct vm_area_struct *vma); extern unsigned long shmem_partial_swap_usage(struct address_space *mapping, pgoff_t start, pgoff_t end); /* Flag allocation requirements to shmem_get_folio */ enum sgp_type { SGP_READ, /* don't exceed i_size, don't allocate page */ SGP_NOALLOC, /* similar, but fail on hole or use fallocated page */ SGP_CACHE, /* don't exceed i_size, may allocate page */ SGP_WRITE, /* may exceed i_size, may allocate !Uptodate page */ SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */ }; int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop, enum sgp_type sgp); static inline struct page *shmem_read_mapping_page( struct address_space *mapping, pgoff_t index) { return shmem_read_mapping_page_gfp(mapping, index, mapping_gfp_mask(mapping)); } static inline bool shmem_file(struct file *file) { if (!IS_ENABLED(CONFIG_SHMEM)) return false; if (!file || !file->f_mapping) return false; return shmem_mapping(file->f_mapping); } /* * If fallocate(FALLOC_FL_KEEP_SIZE) has been used, there may be pages * beyond i_size's notion of EOF, which fallocate has committed to reserving: * which split_huge_page() must therefore not delete. This use of a single * "fallocend" per inode errs on the side of not deleting a reservation when * in doubt: there are plenty of cases when it preserves unreserved pages. */ static inline pgoff_t shmem_fallocend(struct inode *inode, pgoff_t eof) { return max(eof, SHMEM_I(inode)->fallocend); } extern bool shmem_charge(struct inode *inode, long pages); extern void shmem_uncharge(struct inode *inode, long pages); #ifdef CONFIG_USERFAULTFD #ifdef CONFIG_SHMEM extern int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, bool zeropage, bool wp_copy, struct page **pagep); #else /* !CONFIG_SHMEM */ #define shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, \ src_addr, zeropage, wp_copy, pagep) ({ BUG(); 0; }) #endif /* CONFIG_SHMEM */ #endif /* CONFIG_USERFAULTFD */ #endif |
44 44 44 44 19480 18505 140 4058 9974 9966 16578 16495 2480 790 789 5016 441 1 3811 35 25 1844 25 1289 1 2358 1089 1290 2360 2358 2355 2357 2476 2361 129 2480 2477 2 2 2 2 129 2322 35 2334 17 7 2360 2357 2360 2361 2361 2360 2360 61 27 1 37 10 8 10 9 37 87 88 88 88 87 84 88 7011 371 3486 2262 498 383 371 761 97 572 1404 98 1 1228 2238 1 2015 228 2220 17 1695 1692 1 22 1438 301 9202 9202 6719 2323 5779 35 2795 1528 9 1725 1449 2248 2240 18 11 1 2071 179 2 23 5745 11 18047 11469 943 24407 345 10414 1615 1610 4398 4392 5387 4362 4369 4370 107 4288 2388 2389 2367 2333 2334 2334 2317 9 2334 466 467 467 467 441 441 441 407 66 327 146 146 143 6650 9920 3192 498 3886 5390 5390 17 25 24 7 3 3 3 2 1 1 122 50 72 8 52 24780 24861 107 47 61 4 4 63 26 39 3718 1 1 91 49 1 114 114 1 113 113 1 1 3458 3783 26 3471 3809 3815 3816 34 1 32 34 3797 3272 3776 34 156 156 2479 451 4181 135 4109 35 35 79 2638 2633 4 10363 10361 5011 5016 3 3 3 3 3 53 53 53 52 1962 1965 1963 1925 1965 1962 1963 1965 335 335 335 335 1357 110 286 164 999 128 131 131 439 441 46 46 1164 1165 1164 900 899 9972 1159 9972 9986 9973 9968 9966 872 864 7 514 424 426 329 331 79 6 76 656 1 546 1 1 110 2 48 1 5 2 105 408 377 31 31 531 532 206 208 256 1 254 1 57 57 323 14 310 3 307 296 222 71 71 2 21 48 25 21 94 94 94 94 79 5317 5313 3313 8670 8720 8665 3313 14286 8666 8655 1510 7721 8829 221 18 3 8649 2676 2114 623 64 637 2215 2935 2934 2552 2098 530 498 152 149 9 1 3 2 5 8 1 257 492 448 41 64 86 72 13 1 50 49 5388 5393 5393 890 2142 831 20 20 20 20 20 9 7 2 30 5 331 37 29 27 8 6 2 226 225 18 217 388 17 154 9 143 4 156 700 698 699 20 14 42 7 11621 3488 556 7 3034 3349 3345 556 6 2896 425 3346 727 836 836 836 7 155 2 143 8 91 56 207 263 150 211 1021 1026 8 145 878 1 79 1 85 1 1022 229 230 82 82 82 82 9572 9570 1966 1961 92 92 22 23 3227 3233 852 856 136 134 16 16 375 9557 9431 128 183 4 4 1 2 7229 7228 675 6611 5 4332 4330 4332 1515 8 1418 17 17 17 43 42 9 9 47 188 188 84 191 4 206 21 1 21 1452 1444 1447 6610 14 173 1961 4643 53 53 6586 9135 9136 15 6 9 3 2 2 3 2 3 8 7 62 16 18 3 16 9 1 19 48 5 36 2 34 13 17 1 1 15 2 4 1 351 1 337 369 369 545 48 3 3 11333 7413 17 4351 8 1330 4174 1327 1326 5648 5648 1653 5 5 69 425 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 | // SPDX-License-Identifier: GPL-2.0-only /* * NSA Security-Enhanced Linux (SELinux) security module * * This file contains the SELinux hook function implementations. * * Authors: Stephen Smalley, <sds@tycho.nsa.gov> * Chris Vance, <cvance@nai.com> * Wayne Salamon, <wsalamon@nai.com> * James Morris <jmorris@redhat.com> * * Copyright (C) 2001,2002 Networks Associates Technology, Inc. * Copyright (C) 2003-2008 Red Hat, Inc., James Morris <jmorris@redhat.com> * Eric Paris <eparis@redhat.com> * Copyright (C) 2004-2005 Trusted Computer Solutions, Inc. * <dgoeddel@trustedcs.com> * Copyright (C) 2006, 2007, 2009 Hewlett-Packard Development Company, L.P. * Paul Moore <paul@paul-moore.com> * Copyright (C) 2007 Hitachi Software Engineering Co., Ltd. * Yuichi Nakamura <ynakam@hitachisoft.jp> * Copyright (C) 2016 Mellanox Technologies */ #include <linux/init.h> #include <linux/kd.h> #include <linux/kernel.h> #include <linux/kernel_read_file.h> #include <linux/errno.h> #include <linux/sched/signal.h> #include <linux/sched/task.h> #include <linux/lsm_hooks.h> #include <linux/xattr.h> #include <linux/capability.h> #include <linux/unistd.h> #include <linux/mm.h> #include <linux/mman.h> #include <linux/slab.h> #include <linux/pagemap.h> #include <linux/proc_fs.h> #include <linux/swap.h> #include <linux/spinlock.h> #include <linux/syscalls.h> #include <linux/dcache.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/namei.h> #include <linux/mount.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv6.h> #include <linux/tty.h> #include <net/icmp.h> #include <net/ip.h> /* for local_port_range[] */ #include <net/tcp.h> /* struct or_callable used in sock_rcv_skb */ #include <net/inet_connection_sock.h> #include <net/net_namespace.h> #include <net/netlabel.h> #include <linux/uaccess.h> #include <asm/ioctls.h> #include <linux/atomic.h> #include <linux/bitops.h> #include <linux/interrupt.h> #include <linux/netdevice.h> /* for network interface checks */ #include <net/netlink.h> #include <linux/tcp.h> #include <linux/udp.h> #include <linux/dccp.h> #include <linux/sctp.h> #include <net/sctp/structs.h> #include <linux/quota.h> #include <linux/un.h> /* for Unix socket types */ #include <net/af_unix.h> /* for Unix socket types */ #include <linux/parser.h> #include <linux/nfs_mount.h> #include <net/ipv6.h> #include <linux/hugetlb.h> #include <linux/personality.h> #include <linux/audit.h> #include <linux/string.h> #include <linux/mutex.h> #include <linux/posix-timers.h> #include <linux/syslog.h> #include <linux/user_namespace.h> #include <linux/export.h> #include <linux/msg.h> #include <linux/shm.h> #include <linux/bpf.h> #include <linux/kernfs.h> #include <linux/stringhash.h> /* for hashlen_string() */ #include <uapi/linux/mount.h> #include <linux/fsnotify.h> #include <linux/fanotify.h> #include <linux/io_uring.h> #include "avc.h" #include "objsec.h" #include "netif.h" #include "netnode.h" #include "netport.h" #include "ibpkey.h" #include "xfrm.h" #include "netlabel.h" #include "audit.h" #include "avc_ss.h" struct selinux_state selinux_state; /* SECMARK reference count */ static atomic_t selinux_secmark_refcount = ATOMIC_INIT(0); #ifdef CONFIG_SECURITY_SELINUX_DEVELOP static int selinux_enforcing_boot __initdata; static int __init enforcing_setup(char *str) { unsigned long enforcing; if (!kstrtoul(str, 0, &enforcing)) selinux_enforcing_boot = enforcing ? 1 : 0; return 1; } __setup("enforcing=", enforcing_setup); #else #define selinux_enforcing_boot 1 #endif int selinux_enabled_boot __initdata = 1; #ifdef CONFIG_SECURITY_SELINUX_BOOTPARAM static int __init selinux_enabled_setup(char *str) { unsigned long enabled; if (!kstrtoul(str, 0, &enabled)) selinux_enabled_boot = enabled ? 1 : 0; return 1; } __setup("selinux=", selinux_enabled_setup); #endif static unsigned int selinux_checkreqprot_boot = CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE; static int __init checkreqprot_setup(char *str) { unsigned long checkreqprot; if (!kstrtoul(str, 0, &checkreqprot)) { selinux_checkreqprot_boot = checkreqprot ? 1 : 0; if (checkreqprot) pr_err("SELinux: checkreqprot set to 1 via kernel parameter. This is deprecated and will be rejected in a future kernel release.\n"); } return 1; } __setup("checkreqprot=", checkreqprot_setup); /** * selinux_secmark_enabled - Check to see if SECMARK is currently enabled * * Description: * This function checks the SECMARK reference counter to see if any SECMARK * targets are currently configured, if the reference counter is greater than * zero SECMARK is considered to be enabled. Returns true (1) if SECMARK is * enabled, false (0) if SECMARK is disabled. If the always_check_network * policy capability is enabled, SECMARK is always considered enabled. * */ static int selinux_secmark_enabled(void) { return (selinux_policycap_alwaysnetwork() || atomic_read(&selinux_secmark_refcount)); } /** * selinux_peerlbl_enabled - Check to see if peer labeling is currently enabled * * Description: * This function checks if NetLabel or labeled IPSEC is enabled. Returns true * (1) if any are enabled or false (0) if neither are enabled. If the * always_check_network policy capability is enabled, peer labeling * is always considered enabled. * */ static int selinux_peerlbl_enabled(void) { return (selinux_policycap_alwaysnetwork() || netlbl_enabled() || selinux_xfrm_enabled()); } static int selinux_netcache_avc_callback(u32 event) { if (event == AVC_CALLBACK_RESET) { sel_netif_flush(); sel_netnode_flush(); sel_netport_flush(); synchronize_net(); } return 0; } static int selinux_lsm_notifier_avc_callback(u32 event) { if (event == AVC_CALLBACK_RESET) { sel_ib_pkey_flush(); call_blocking_lsm_notifier(LSM_POLICY_CHANGE, NULL); } return 0; } /* * initialise the security for the init task */ static void cred_init_security(void) { struct task_security_struct *tsec; tsec = selinux_cred(unrcu_pointer(current->real_cred)); tsec->osid = tsec->sid = SECINITSID_KERNEL; } /* * get the security ID of a set of credentials */ static inline u32 cred_sid(const struct cred *cred) { const struct task_security_struct *tsec; tsec = selinux_cred(cred); return tsec->sid; } /* * get the objective security ID of a task */ static inline u32 task_sid_obj(const struct task_struct *task) { u32 sid; rcu_read_lock(); sid = cred_sid(__task_cred(task)); rcu_read_unlock(); return sid; } static int inode_doinit_with_dentry(struct inode *inode, struct dentry *opt_dentry); /* * Try reloading inode security labels that have been marked as invalid. The * @may_sleep parameter indicates when sleeping and thus reloading labels is * allowed; when set to false, returns -ECHILD when the label is * invalid. The @dentry parameter should be set to a dentry of the inode. */ static int __inode_security_revalidate(struct inode *inode, struct dentry *dentry, bool may_sleep) { struct inode_security_struct *isec = selinux_inode(inode); might_sleep_if(may_sleep); if (selinux_initialized(&selinux_state) && isec->initialized != LABEL_INITIALIZED) { if (!may_sleep) return -ECHILD; /* * Try reloading the inode security label. This will fail if * @opt_dentry is NULL and no dentry for this inode can be * found; in that case, continue using the old label. */ inode_doinit_with_dentry(inode, dentry); } return 0; } static struct inode_security_struct *inode_security_novalidate(struct inode *inode) { return selinux_inode(inode); } static struct inode_security_struct *inode_security_rcu(struct inode *inode, bool rcu) { int error; error = __inode_security_revalidate(inode, NULL, !rcu); if (error) return ERR_PTR(error); return selinux_inode(inode); } /* * Get the security label of an inode. */ static struct inode_security_struct *inode_security(struct inode *inode) { __inode_security_revalidate(inode, NULL, true); return selinux_inode(inode); } static struct inode_security_struct *backing_inode_security_novalidate(struct dentry *dentry) { struct inode *inode = d_backing_inode(dentry); return selinux_inode(inode); } /* * Get the security label of a dentry's backing inode. */ static struct inode_security_struct *backing_inode_security(struct dentry *dentry) { struct inode *inode = d_backing_inode(dentry); __inode_security_revalidate(inode, dentry, true); return selinux_inode(inode); } static void inode_free_security(struct inode *inode) { struct inode_security_struct *isec = selinux_inode(inode); struct superblock_security_struct *sbsec; if (!isec) return; sbsec = selinux_superblock(inode->i_sb); /* * As not all inode security structures are in a list, we check for * empty list outside of the lock to make sure that we won't waste * time taking a lock doing nothing. * * The list_del_init() function can be safely called more than once. * It should not be possible for this function to be called with * concurrent list_add(), but for better safety against future changes * in the code, we use list_empty_careful() here. */ if (!list_empty_careful(&isec->list)) { spin_lock(&sbsec->isec_lock); list_del_init(&isec->list); spin_unlock(&sbsec->isec_lock); } } struct selinux_mnt_opts { u32 fscontext_sid; u32 context_sid; u32 rootcontext_sid; u32 defcontext_sid; }; static void selinux_free_mnt_opts(void *mnt_opts) { kfree(mnt_opts); } enum { Opt_error = -1, Opt_context = 0, Opt_defcontext = 1, Opt_fscontext = 2, Opt_rootcontext = 3, Opt_seclabel = 4, }; #define A(s, has_arg) {#s, sizeof(#s) - 1, Opt_##s, has_arg} static struct { const char *name; int len; int opt; bool has_arg; } tokens[] = { A(context, true), A(fscontext, true), A(defcontext, true), A(rootcontext, true), A(seclabel, false), }; #undef A static int match_opt_prefix(char *s, int l, char **arg) { int i; for (i = 0; i < ARRAY_SIZE(tokens); i++) { size_t len = tokens[i].len; if (len > l || memcmp(s, tokens[i].name, len)) continue; if (tokens[i].has_arg) { if (len == l || s[len] != '=') continue; *arg = s + len + 1; } else if (len != l) continue; return tokens[i].opt; } return Opt_error; } #define SEL_MOUNT_FAIL_MSG "SELinux: duplicate or incompatible mount options\n" static int may_context_mount_sb_relabel(u32 sid, struct superblock_security_struct *sbsec, const struct cred *cred) { const struct task_security_struct *tsec = selinux_cred(cred); int rc; rc = avc_has_perm(&selinux_state, tsec->sid, sbsec->sid, SECCLASS_FILESYSTEM, FILESYSTEM__RELABELFROM, NULL); if (rc) return rc; rc = avc_has_perm(&selinux_state, tsec->sid, sid, SECCLASS_FILESYSTEM, FILESYSTEM__RELABELTO, NULL); return rc; } static int may_context_mount_inode_relabel(u32 sid, struct superblock_security_struct *sbsec, const struct cred *cred) { const struct task_security_struct *tsec = selinux_cred(cred); int rc; rc = avc_has_perm(&selinux_state, tsec->sid, sbsec->sid, SECCLASS_FILESYSTEM, FILESYSTEM__RELABELFROM, NULL); if (rc) return rc; rc = avc_has_perm(&selinux_state, sid, sbsec->sid, SECCLASS_FILESYSTEM, FILESYSTEM__ASSOCIATE, NULL); return rc; } static int selinux_is_genfs_special_handling(struct super_block *sb) { /* Special handling. Genfs but also in-core setxattr handler */ return !strcmp(sb->s_type->name, "sysfs") || !strcmp(sb->s_type->name, "pstore") || !strcmp(sb->s_type->name, "debugfs") || !strcmp(sb->s_type->name, "tracefs") || !strcmp(sb->s_type->name, "rootfs") || (selinux_policycap_cgroupseclabel() && (!strcmp(sb->s_type->name, "cgroup") || !strcmp(sb->s_type->name, "cgroup2"))); } static int selinux_is_sblabel_mnt(struct super_block *sb) { struct superblock_security_struct *sbsec = selinux_superblock(sb); /* * IMPORTANT: Double-check logic in this function when adding a new * SECURITY_FS_USE_* definition! */ BUILD_BUG_ON(SECURITY_FS_USE_MAX != 7); switch (sbsec->behavior) { case SECURITY_FS_USE_XATTR: case SECURITY_FS_USE_TRANS: case SECURITY_FS_USE_TASK: case SECURITY_FS_USE_NATIVE: return 1; case SECURITY_FS_USE_GENFS: return selinux_is_genfs_special_handling(sb); /* Never allow relabeling on context mounts */ case SECURITY_FS_USE_MNTPOINT: case SECURITY_FS_USE_NONE: default: return 0; } } static int sb_check_xattr_support(struct super_block *sb) { struct superblock_security_struct *sbsec = selinux_superblock(sb); struct dentry *root = sb->s_root; struct inode *root_inode = d_backing_inode(root); u32 sid; int rc; /* * Make sure that the xattr handler exists and that no * error other than -ENODATA is returned by getxattr on * the root directory. -ENODATA is ok, as this may be * the first boot of the SELinux kernel before we have * assigned xattr values to the filesystem. */ if (!(root_inode->i_opflags & IOP_XATTR)) { pr_warn("SELinux: (dev %s, type %s) has no xattr support\n", sb->s_id, sb->s_type->name); goto fallback; } rc = __vfs_getxattr(root, root_inode, XATTR_NAME_SELINUX, NULL, 0); if (rc < 0 && rc != -ENODATA) { if (rc == -EOPNOTSUPP) { pr_warn("SELinux: (dev %s, type %s) has no security xattr handler\n", sb->s_id, sb->s_type->name); goto fallback; } else { pr_warn("SELinux: (dev %s, type %s) getxattr errno %d\n", sb->s_id, sb->s_type->name, -rc); return rc; } } return 0; fallback: /* No xattr support - try to fallback to genfs if possible. */ rc = security_genfs_sid(&selinux_state, sb->s_type->name, "/", SECCLASS_DIR, &sid); if (rc) return -EOPNOTSUPP; pr_warn("SELinux: (dev %s, type %s) falling back to genfs\n", sb->s_id, sb->s_type->name); sbsec->behavior = SECURITY_FS_USE_GENFS; sbsec->sid = sid; return 0; } static int sb_finish_set_opts(struct super_block *sb) { struct superblock_security_struct *sbsec = selinux_superblock(sb); struct dentry *root = sb->s_root; struct inode *root_inode = d_backing_inode(root); int rc = 0; if (sbsec->behavior == SECURITY_FS_USE_XATTR) { rc = sb_check_xattr_support(sb); if (rc) return rc; } sbsec->flags |= SE_SBINITIALIZED; /* * Explicitly set or clear SBLABEL_MNT. It's not sufficient to simply * leave the flag untouched because sb_clone_mnt_opts might be handing * us a superblock that needs the flag to be cleared. */ if (selinux_is_sblabel_mnt(sb)) sbsec->flags |= SBLABEL_MNT; else sbsec->flags &= ~SBLABEL_MNT; /* Initialize the root inode. */ rc = inode_doinit_with_dentry(root_inode, root); /* Initialize any other inodes associated with the superblock, e.g. inodes created prior to initial policy load or inodes created during get_sb by a pseudo filesystem that directly populates itself. */ spin_lock(&sbsec->isec_lock); while (!list_empty(&sbsec->isec_head)) { struct inode_security_struct *isec = list_first_entry(&sbsec->isec_head, struct inode_security_struct, list); struct inode *inode = isec->inode; list_del_init(&isec->list); spin_unlock(&sbsec->isec_lock); inode = igrab(inode); if (inode) { if (!IS_PRIVATE(inode)) inode_doinit_with_dentry(inode, NULL); iput(inode); } spin_lock(&sbsec->isec_lock); } spin_unlock(&sbsec->isec_lock); return rc; } static int bad_option(struct superblock_security_struct *sbsec, char flag, u32 old_sid, u32 new_sid) { char mnt_flags = sbsec->flags & SE_MNTMASK; /* check if the old mount command had the same options */ if (sbsec->flags & SE_SBINITIALIZED) if (!(sbsec->flags & flag) || (old_sid != new_sid)) return 1; /* check if we were passed the same options twice, * aka someone passed context=a,context=b */ if (!(sbsec->flags & SE_SBINITIALIZED)) if (mnt_flags & flag) return 1; return 0; } /* * Allow filesystems with binary mount data to explicitly set mount point * labeling information. */ static int selinux_set_mnt_opts(struct super_block *sb, void *mnt_opts, unsigned long kern_flags, unsigned long *set_kern_flags) { const struct cred *cred = current_cred(); struct superblock_security_struct *sbsec = selinux_superblock(sb); struct dentry *root = sb->s_root; struct selinux_mnt_opts *opts = mnt_opts; struct inode_security_struct *root_isec; u32 fscontext_sid = 0, context_sid = 0, rootcontext_sid = 0; u32 defcontext_sid = 0; int rc = 0; mutex_lock(&sbsec->lock); if (!selinux_initialized(&selinux_state)) { if (!opts) { /* Defer initialization until selinux_complete_init, after the initial policy is loaded and the security server is ready to handle calls. */ goto out; } rc = -EINVAL; pr_warn("SELinux: Unable to set superblock options " "before the security server is initialized\n"); goto out; } if (kern_flags && !set_kern_flags) { /* Specifying internal flags without providing a place to * place the results is not allowed */ rc = -EINVAL; goto out; } /* * Binary mount data FS will come through this function twice. Once * from an explicit call and once from the generic calls from the vfs. * Since the generic VFS calls will not contain any security mount data * we need to skip the double mount verification. * * This does open a hole in which we will not notice if the first * mount using this sb set explicit options and a second mount using * this sb does not set any security options. (The first options * will be used for both mounts) */ if ((sbsec->flags & SE_SBINITIALIZED) && (sb->s_type->fs_flags & FS_BINARY_MOUNTDATA) && !opts) goto out; root_isec = backing_inode_security_novalidate(root); /* * parse the mount options, check if they are valid sids. * also check if someone is trying to mount the same sb more * than once with different security options. */ if (opts) { if (opts->fscontext_sid) { fscontext_sid = opts->fscontext_sid; if (bad_option(sbsec, FSCONTEXT_MNT, sbsec->sid, fscontext_sid)) goto out_double_mount; sbsec->flags |= FSCONTEXT_MNT; } if (opts->context_sid) { context_sid = opts->context_sid; if (bad_option(sbsec, CONTEXT_MNT, sbsec->mntpoint_sid, context_sid)) goto out_double_mount; sbsec->flags |= CONTEXT_MNT; } if (opts->rootcontext_sid) { rootcontext_sid = opts->rootcontext_sid; if (bad_option(sbsec, ROOTCONTEXT_MNT, root_isec->sid, rootcontext_sid)) goto out_double_mount; sbsec->flags |= ROOTCONTEXT_MNT; } if (opts->defcontext_sid) { defcontext_sid = opts->defcontext_sid; if (bad_option(sbsec, DEFCONTEXT_MNT, sbsec->def_sid, defcontext_sid)) goto out_double_mount; sbsec->flags |= DEFCONTEXT_MNT; } } if (sbsec->flags & SE_SBINITIALIZED) { /* previously mounted with options, but not on this attempt? */ if ((sbsec->flags & SE_MNTMASK) && !opts) goto out_double_mount; rc = 0; goto out; } if (strcmp(sb->s_type->name, "proc") == 0) sbsec->flags |= SE_SBPROC | SE_SBGENFS; if (!strcmp(sb->s_type->name, "debugfs") || !strcmp(sb->s_type->name, "tracefs") || !strcmp(sb->s_type->name, "binder") || !strcmp(sb->s_type->name, "bpf") || !strcmp(sb->s_type->name, "pstore") || !strcmp(sb->s_type->name, "securityfs")) sbsec->flags |= SE_SBGENFS; if (!strcmp(sb->s_type->name, "sysfs") || !strcmp(sb->s_type->name, "cgroup") || !strcmp(sb->s_type->name, "cgroup2")) sbsec->flags |= SE_SBGENFS | SE_SBGENFS_XATTR; if (!sbsec->behavior) { /* * Determine the labeling behavior to use for this * filesystem type. */ rc = security_fs_use(&selinux_state, sb); if (rc) { pr_warn("%s: security_fs_use(%s) returned %d\n", __func__, sb->s_type->name, rc); goto out; } } /* * If this is a user namespace mount and the filesystem type is not * explicitly whitelisted, then no contexts are allowed on the command * line and security labels must be ignored. */ if (sb->s_user_ns != &init_user_ns && strcmp(sb->s_type->name, "tmpfs") && strcmp(sb->s_type->name, "ramfs") && strcmp(sb->s_type->name, "devpts") && strcmp(sb->s_type->name, "overlay")) { if (context_sid || fscontext_sid || rootcontext_sid || defcontext_sid) { rc = -EACCES; goto out; } if (sbsec->behavior == SECURITY_FS_USE_XATTR) { sbsec->behavior = SECURITY_FS_USE_MNTPOINT; rc = security_transition_sid(&selinux_state, current_sid(), current_sid(), SECCLASS_FILE, NULL, &sbsec->mntpoint_sid); if (rc) goto out; } goto out_set_opts; } /* sets the context of the superblock for the fs being mounted. */ if (fscontext_sid) { rc = may_context_mount_sb_relabel(fscontext_sid, sbsec, cred); if (rc) goto out; sbsec->sid = fscontext_sid; } /* * Switch to using mount point labeling behavior. * sets the label used on all file below the mountpoint, and will set * the superblock context if not already set. */ if (kern_flags & SECURITY_LSM_NATIVE_LABELS && !context_sid) { sbsec->behavior = SECURITY_FS_USE_NATIVE; *set_kern_flags |= SECURITY_LSM_NATIVE_LABELS; } if (context_sid) { if (!fscontext_sid) { rc = may_context_mount_sb_relabel(context_sid, sbsec, cred); if (rc) goto out; sbsec->sid = context_sid; } else { rc = may_context_mount_inode_relabel(context_sid, sbsec, cred); if (rc) goto out; } if (!rootcontext_sid) rootcontext_sid = context_sid; sbsec->mntpoint_sid = context_sid; sbsec->behavior = SECURITY_FS_USE_MNTPOINT; } if (rootcontext_sid) { rc = may_context_mount_inode_relabel(rootcontext_sid, sbsec, cred); if (rc) goto out; root_isec->sid = rootcontext_sid; root_isec->initialized = LABEL_INITIALIZED; } if (defcontext_sid) { if (sbsec->behavior != SECURITY_FS_USE_XATTR && sbsec->behavior != SECURITY_FS_USE_NATIVE) { rc = -EINVAL; pr_warn("SELinux: defcontext option is " "invalid for this filesystem type\n"); goto out; } if (defcontext_sid != sbsec->def_sid) { rc = may_context_mount_inode_relabel(defcontext_sid, sbsec, cred); if (rc) goto out; } sbsec->def_sid = defcontext_sid; } out_set_opts: rc = sb_finish_set_opts(sb); out: mutex_unlock(&sbsec->lock); return rc; out_double_mount: rc = -EINVAL; pr_warn("SELinux: mount invalid. Same superblock, different " "security settings for (dev %s, type %s)\n", sb->s_id, sb->s_type->name); goto out; } static int selinux_cmp_sb_context(const struct super_block *oldsb, const struct super_block *newsb) { struct superblock_security_struct *old = selinux_superblock(oldsb); struct superblock_security_struct *new = selinux_superblock(newsb); char oldflags = old->flags & SE_MNTMASK; char newflags = new->flags & SE_MNTMASK; if (oldflags != newflags) goto mismatch; if ((oldflags & FSCONTEXT_MNT) && old->sid != new->sid) goto mismatch; if ((oldflags & CONTEXT_MNT) && old->mntpoint_sid != new->mntpoint_sid) goto mismatch; if ((oldflags & DEFCONTEXT_MNT) && old->def_sid != new->def_sid) goto mismatch; if (oldflags & ROOTCONTEXT_MNT) { struct inode_security_struct *oldroot = backing_inode_security(oldsb->s_root); struct inode_security_struct *newroot = backing_inode_security(newsb->s_root); if (oldroot->sid != newroot->sid) goto mismatch; } return 0; mismatch: pr_warn("SELinux: mount invalid. Same superblock, " "different security settings for (dev %s, " "type %s)\n", newsb->s_id, newsb->s_type->name); return -EBUSY; } static int selinux_sb_clone_mnt_opts(const struct super_block *oldsb, struct super_block *newsb, unsigned long kern_flags, unsigned long *set_kern_flags) { int rc = 0; const struct superblock_security_struct *oldsbsec = selinux_superblock(oldsb); struct superblock_security_struct *newsbsec = selinux_superblock(newsb); int set_fscontext = (oldsbsec->flags & FSCONTEXT_MNT); int set_context = (oldsbsec->flags & CONTEXT_MNT); int set_rootcontext = (oldsbsec->flags & ROOTCONTEXT_MNT); /* * if the parent was able to be mounted it clearly had no special lsm * mount options. thus we can safely deal with this superblock later */ if (!selinux_initialized(&selinux_state)) return 0; /* * Specifying internal flags without providing a place to * place the results is not allowed. */ if (kern_flags && !set_kern_flags) return -EINVAL; /* how can we clone if the old one wasn't set up?? */ BUG_ON(!(oldsbsec->flags & SE_SBINITIALIZED)); /* if fs is reusing a sb, make sure that the contexts match */ if (newsbsec->flags & SE_SBINITIALIZED) { if ((kern_flags & SECURITY_LSM_NATIVE_LABELS) && !set_context) *set_kern_flags |= SECURITY_LSM_NATIVE_LABELS; return selinux_cmp_sb_context(oldsb, newsb); } mutex_lock(&newsbsec->lock); newsbsec->flags = oldsbsec->flags; newsbsec->sid = oldsbsec->sid; newsbsec->def_sid = oldsbsec->def_sid; newsbsec->behavior = oldsbsec->behavior; if (newsbsec->behavior == SECURITY_FS_USE_NATIVE && !(kern_flags & SECURITY_LSM_NATIVE_LABELS) && !set_context) { rc = security_fs_use(&selinux_state, newsb); if (rc) goto out; } if (kern_flags & SECURITY_LSM_NATIVE_LABELS && !set_context) { newsbsec->behavior = SECURITY_FS_USE_NATIVE; *set_kern_flags |= SECURITY_LSM_NATIVE_LABELS; } if (set_context) { u32 sid = oldsbsec->mntpoint_sid; if (!set_fscontext) newsbsec->sid = sid; if (!set_rootcontext) { struct inode_security_struct *newisec = backing_inode_security(newsb->s_root); newisec->sid = sid; } newsbsec->mntpoint_sid = sid; } if (set_rootcontext) { const struct inode_security_struct *oldisec = backing_inode_security(oldsb->s_root); struct inode_security_struct *newisec = backing_inode_security(newsb->s_root); newisec->sid = oldisec->sid; } sb_finish_set_opts(newsb); out: mutex_unlock(&newsbsec->lock); return rc; } /* * NOTE: the caller is resposible for freeing the memory even if on error. */ static int selinux_add_opt(int token, const char *s, void **mnt_opts) { struct selinux_mnt_opts *opts = *mnt_opts; u32 *dst_sid; int rc; if (token == Opt_seclabel) /* eaten and completely ignored */ return 0; if (!s) return -EINVAL; if (!selinux_initialized(&selinux_state)) { pr_warn("SELinux: Unable to set superblock options before the security server is initialized\n"); return -EINVAL; } if (!opts) { opts = kzalloc(sizeof(*opts), GFP_KERNEL); if (!opts) return -ENOMEM; *mnt_opts = opts; } switch (token) { case Opt_context: if (opts->context_sid || opts->defcontext_sid) goto err; dst_sid = &opts->context_sid; break; case Opt_fscontext: if (opts->fscontext_sid) goto err; dst_sid = &opts->fscontext_sid; break; case Opt_rootcontext: if (opts->rootcontext_sid) goto err; dst_sid = &opts->rootcontext_sid; break; case Opt_defcontext: if (opts->context_sid || opts->defcontext_sid) goto err; dst_sid = &opts->defcontext_sid; break; default: WARN_ON(1); return -EINVAL; } rc = security_context_str_to_sid(&selinux_state, s, dst_sid, GFP_KERNEL); if (rc) pr_warn("SELinux: security_context_str_to_sid (%s) failed with errno=%d\n", s, rc); return rc; err: pr_warn(SEL_MOUNT_FAIL_MSG); return -EINVAL; } static int show_sid(struct seq_file *m, u32 sid) { char *context = NULL; u32 len; int rc; rc = security_sid_to_context(&selinux_state, sid, &context, &len); if (!rc) { bool has_comma = strchr(context, ','); seq_putc(m, '='); if (has_comma) seq_putc(m, '\"'); seq_escape(m, context, "\"\n\\"); if (has_comma) seq_putc(m, '\"'); } kfree(context); return rc; } static int selinux_sb_show_options(struct seq_file *m, struct super_block *sb) { struct superblock_security_struct *sbsec = selinux_superblock(sb); int rc; if (!(sbsec->flags & SE_SBINITIALIZED)) return 0; if (!selinux_initialized(&selinux_state)) return 0; if (sbsec->flags & FSCONTEXT_MNT) { seq_putc(m, ','); seq_puts(m, FSCONTEXT_STR); rc = show_sid(m, sbsec->sid); if (rc) return rc; } if (sbsec->flags & CONTEXT_MNT) { seq_putc(m, ','); seq_puts(m, CONTEXT_STR); rc = show_sid(m, sbsec->mntpoint_sid); if (rc) return rc; } if (sbsec->flags & DEFCONTEXT_MNT) { seq_putc(m, ','); seq_puts(m, DEFCONTEXT_STR); rc = show_sid(m, sbsec->def_sid); if (rc) return rc; } if (sbsec->flags & ROOTCONTEXT_MNT) { struct dentry *root = sb->s_root; struct inode_security_struct *isec = backing_inode_security(root); seq_putc(m, ','); seq_puts(m, ROOTCONTEXT_STR); rc = show_sid(m, isec->sid); if (rc) return rc; } if (sbsec->flags & SBLABEL_MNT) { seq_putc(m, ','); seq_puts(m, SECLABEL_STR); } return 0; } static inline u16 inode_mode_to_security_class(umode_t mode) { switch (mode & S_IFMT) { case S_IFSOCK: return SECCLASS_SOCK_FILE; case S_IFLNK: return SECCLASS_LNK_FILE; case S_IFREG: return SECCLASS_FILE; case S_IFBLK: return SECCLASS_BLK_FILE; case S_IFDIR: return SECCLASS_DIR; case S_IFCHR: return SECCLASS_CHR_FILE; case S_IFIFO: return SECCLASS_FIFO_FILE; } return SECCLASS_FILE; } static inline int default_protocol_stream(int protocol) { return (protocol == IPPROTO_IP || protocol == IPPROTO_TCP || protocol == IPPROTO_MPTCP); } static inline int default_protocol_dgram(int protocol) { return (protocol == IPPROTO_IP || protocol == IPPROTO_UDP); } static inline u16 socket_type_to_security_class(int family, int type, int protocol) { int extsockclass = selinux_policycap_extsockclass(); switch (family) { case PF_UNIX: switch (type) { case SOCK_STREAM: case SOCK_SEQPACKET: return SECCLASS_UNIX_STREAM_SOCKET; case SOCK_DGRAM: case SOCK_RAW: return SECCLASS_UNIX_DGRAM_SOCKET; } break; case PF_INET: case PF_INET6: switch (type) { case SOCK_STREAM: case SOCK_SEQPACKET: if (default_protocol_stream(protocol)) return SECCLASS_TCP_SOCKET; else if (extsockclass && protocol == IPPROTO_SCTP) return SECCLASS_SCTP_SOCKET; else return SECCLASS_RAWIP_SOCKET; case SOCK_DGRAM: if (default_protocol_dgram(protocol)) return SECCLASS_UDP_SOCKET; else if (extsockclass && (protocol == IPPROTO_ICMP || protocol == IPPROTO_ICMPV6)) return SECCLASS_ICMP_SOCKET; else return SECCLASS_RAWIP_SOCKET; case SOCK_DCCP: return SECCLASS_DCCP_SOCKET; default: return SECCLASS_RAWIP_SOCKET; } break; case PF_NETLINK: switch (protocol) { case NETLINK_ROUTE: return SECCLASS_NETLINK_ROUTE_SOCKET; case NETLINK_SOCK_DIAG: return SECCLASS_NETLINK_TCPDIAG_SOCKET; case NETLINK_NFLOG: return SECCLASS_NETLINK_NFLOG_SOCKET; case NETLINK_XFRM: return SECCLASS_NETLINK_XFRM_SOCKET; case NETLINK_SELINUX: return SECCLASS_NETLINK_SELINUX_SOCKET; case NETLINK_ISCSI: return SECCLASS_NETLINK_ISCSI_SOCKET; case NETLINK_AUDIT: return SECCLASS_NETLINK_AUDIT_SOCKET; case NETLINK_FIB_LOOKUP: return SECCLASS_NETLINK_FIB_LOOKUP_SOCKET; case NETLINK_CONNECTOR: return SECCLASS_NETLINK_CONNECTOR_SOCKET; case NETLINK_NETFILTER: return SECCLASS_NETLINK_NETFILTER_SOCKET; case NETLINK_DNRTMSG: return SECCLASS_NETLINK_DNRT_SOCKET; case NETLINK_KOBJECT_UEVENT: return SECCLASS_NETLINK_KOBJECT_UEVENT_SOCKET; case NETLINK_GENERIC: return SECCLASS_NETLINK_GENERIC_SOCKET; case NETLINK_SCSITRANSPORT: return SECCLASS_NETLINK_SCSITRANSPORT_SOCKET; case NETLINK_RDMA: return SECCLASS_NETLINK_RDMA_SOCKET; case NETLINK_CRYPTO: return SECCLASS_NETLINK_CRYPTO_SOCKET; default: return SECCLASS_NETLINK_SOCKET; } case PF_PACKET: return SECCLASS_PACKET_SOCKET; case PF_KEY: return SECCLASS_KEY_SOCKET; case PF_APPLETALK: return SECCLASS_APPLETALK_SOCKET; } if (extsockclass) { switch (family) { case PF_AX25: return SECCLASS_AX25_SOCKET; case PF_IPX: return SECCLASS_IPX_SOCKET; case PF_NETROM: return SECCLASS_NETROM_SOCKET; case PF_ATMPVC: return SECCLASS_ATMPVC_SOCKET; case PF_X25: return SECCLASS_X25_SOCKET; case PF_ROSE: return SECCLASS_ROSE_SOCKET; case PF_DECnet: return SECCLASS_DECNET_SOCKET; case PF_ATMSVC: return SECCLASS_ATMSVC_SOCKET; case PF_RDS: return SECCLASS_RDS_SOCKET; case PF_IRDA: return SECCLASS_IRDA_SOCKET; case PF_PPPOX: return SECCLASS_PPPOX_SOCKET; case PF_LLC: return SECCLASS_LLC_SOCKET; case PF_CAN: return SECCLASS_CAN_SOCKET; case PF_TIPC: return SECCLASS_TIPC_SOCKET; case PF_BLUETOOTH: return SECCLASS_BLUETOOTH_SOCKET; case PF_IUCV: return SECCLASS_IUCV_SOCKET; case PF_RXRPC: return SECCLASS_RXRPC_SOCKET; case PF_ISDN: return SECCLASS_ISDN_SOCKET; case PF_PHONET: return SECCLASS_PHONET_SOCKET; case PF_IEEE802154: return SECCLASS_IEEE802154_SOCKET; case PF_CAIF: return SECCLASS_CAIF_SOCKET; case PF_ALG: return SECCLASS_ALG_SOCKET; case PF_NFC: return SECCLASS_NFC_SOCKET; case PF_VSOCK: return SECCLASS_VSOCK_SOCKET; case PF_KCM: return SECCLASS_KCM_SOCKET; case PF_QIPCRTR: return SECCLASS_QIPCRTR_SOCKET; case PF_SMC: return SECCLASS_SMC_SOCKET; case PF_XDP: return SECCLASS_XDP_SOCKET; case PF_MCTP: return SECCLASS_MCTP_SOCKET; #if PF_MAX > 46 #error New address family defined, please update this function. #endif } } return SECCLASS_SOCKET; } static int selinux_genfs_get_sid(struct dentry *dentry, u16 tclass, u16 flags, u32 *sid) { int rc; struct super_block *sb = dentry->d_sb; char *buffer, *path; buffer = (char *)__get_free_page(GFP_KERNEL); if (!buffer) return -ENOMEM; path = dentry_path_raw(dentry, buffer, PAGE_SIZE); if (IS_ERR(path)) rc = PTR_ERR(path); else { if (flags & SE_SBPROC) { /* each process gets a /proc/PID/ entry. Strip off the * PID part to get a valid selinux labeling. * e.g. /proc/1/net/rpc/nfs -> /net/rpc/nfs */ while (path[1] >= '0' && path[1] <= '9') { path[1] = '/'; path++; } } rc = security_genfs_sid(&selinux_state, sb->s_type->name, path, tclass, sid); if (rc == -ENOENT) { /* No match in policy, mark as unlabeled. */ *sid = SECINITSID_UNLABELED; rc = 0; } } free_page((unsigned long)buffer); return rc; } static int inode_doinit_use_xattr(struct inode *inode, struct dentry *dentry, u32 def_sid, u32 *sid) { #define INITCONTEXTLEN 255 char *context; unsigned int len; int rc; len = INITCONTEXTLEN; context = kmalloc(len + 1, GFP_NOFS); if (!context) return -ENOMEM; context[len] = '\0'; rc = __vfs_getxattr(dentry, inode, XATTR_NAME_SELINUX, context, len); if (rc == -ERANGE) { kfree(context); /* Need a larger buffer. Query for the right size. */ rc = __vfs_getxattr(dentry, inode, XATTR_NAME_SELINUX, NULL, 0); if (rc < 0) return rc; len = rc; context = kmalloc(len + 1, GFP_NOFS); if (!context) return -ENOMEM; context[len] = '\0'; rc = __vfs_getxattr(dentry, inode, XATTR_NAME_SELINUX, context, len); } if (rc < 0) { kfree(context); if (rc != -ENODATA) { pr_warn("SELinux: %s: getxattr returned %d for dev=%s ino=%ld\n", __func__, -rc, inode->i_sb->s_id, inode->i_ino); return rc; } *sid = def_sid; return 0; } rc = security_context_to_sid_default(&selinux_state, context, rc, sid, def_sid, GFP_NOFS); if (rc) { char *dev = inode->i_sb->s_id; unsigned long ino = inode->i_ino; if (rc == -EINVAL) { pr_notice_ratelimited("SELinux: inode=%lu on dev=%s was found to have an invalid context=%s. This indicates you may need to relabel the inode or the filesystem in question.\n", ino, dev, context); } else { pr_warn("SELinux: %s: context_to_sid(%s) returned %d for dev=%s ino=%ld\n", __func__, context, -rc, dev, ino); } } kfree(context); return 0; } /* The inode's security attributes must be initialized before first use. */ static int inode_doinit_with_dentry(struct inode *inode, struct dentry *opt_dentry) { struct superblock_security_struct *sbsec = NULL; struct inode_security_struct *isec = selinux_inode(inode); u32 task_sid, sid = 0; u16 sclass; struct dentry *dentry; int rc = 0; if (isec->initialized == LABEL_INITIALIZED) return 0; spin_lock(&isec->lock); if (isec->initialized == LABEL_INITIALIZED) goto out_unlock; if (isec->sclass == SECCLASS_FILE) isec->sclass = inode_mode_to_security_class(inode->i_mode); sbsec = selinux_superblock(inode->i_sb); if (!(sbsec->flags & SE_SBINITIALIZED)) { /* Defer initialization until selinux_complete_init, after the initial policy is loaded and the security server is ready to handle calls. */ spin_lock(&sbsec->isec_lock); if (list_empty(&isec->list)) list_add(&isec->list, &sbsec->isec_head); spin_unlock(&sbsec->isec_lock); goto out_unlock; } sclass = isec->sclass; task_sid = isec->task_sid; sid = isec->sid; isec->initialized = LABEL_PENDING; spin_unlock(&isec->lock); switch (sbsec->behavior) { case SECURITY_FS_USE_NATIVE: break; case SECURITY_FS_USE_XATTR: if (!(inode->i_opflags & IOP_XATTR)) { sid = sbsec->def_sid; break; } /* Need a dentry, since the xattr API requires one. Life would be simpler if we could just pass the inode. */ if (opt_dentry) { /* Called from d_instantiate or d_splice_alias. */ dentry = dget(opt_dentry); } else { /* * Called from selinux_complete_init, try to find a dentry. * Some filesystems really want a connected one, so try * that first. We could split SECURITY_FS_USE_XATTR in * two, depending upon that... */ dentry = d_find_alias(inode); if (!dentry) dentry = d_find_any_alias(inode); } if (!dentry) { /* * this is can be hit on boot when a file is accessed * before the policy is loaded. When we load policy we * may find inodes that have no dentry on the * sbsec->isec_head list. No reason to complain as these * will get fixed up the next time we go through * inode_doinit with a dentry, before these inodes could * be used again by userspace. */ goto out_invalid; } rc = inode_doinit_use_xattr(inode, dentry, sbsec->def_sid, &sid); dput(dentry); if (rc) goto out; break; case SECURITY_FS_USE_TASK: sid = task_sid; break; case SECURITY_FS_USE_TRANS: /* Default to the fs SID. */ sid = sbsec->sid; /* Try to obtain a transition SID. */ rc = security_transition_sid(&selinux_state, task_sid, sid, sclass, NULL, &sid); if (rc) goto out; break; case SECURITY_FS_USE_MNTPOINT: sid = sbsec->mntpoint_sid; break; default: /* Default to the fs superblock SID. */ sid = sbsec->sid; if ((sbsec->flags & SE_SBGENFS) && (!S_ISLNK(inode->i_mode) || selinux_policycap_genfs_seclabel_symlinks())) { /* We must have a dentry to determine the label on * procfs inodes */ if (opt_dentry) { /* Called from d_instantiate or * d_splice_alias. */ dentry = dget(opt_dentry); } else { /* Called from selinux_complete_init, try to * find a dentry. Some filesystems really want * a connected one, so try that first. */ dentry = d_find_alias(inode); if (!dentry) dentry = d_find_any_alias(inode); } /* * This can be hit on boot when a file is accessed * before the policy is loaded. When we load policy we * may find inodes that have no dentry on the * sbsec->isec_head list. No reason to complain as * these will get fixed up the next time we go through * inode_doinit() with a dentry, before these inodes * could be used again by userspace. */ if (!dentry) goto out_invalid; rc = selinux_genfs_get_sid(dentry, sclass, sbsec->flags, &sid); if (rc) { dput(dentry); goto out; } if ((sbsec->flags & SE_SBGENFS_XATTR) && (inode->i_opflags & IOP_XATTR)) { rc = inode_doinit_use_xattr(inode, dentry, sid, &sid); if (rc) { dput(dentry); goto out; } } dput(dentry); } break; } out: spin_lock(&isec->lock); if (isec->initialized == LABEL_PENDING) { if (rc) { isec->initialized = LABEL_INVALID; goto out_unlock; } isec->initialized = LABEL_INITIALIZED; isec->sid = sid; } out_unlock: spin_unlock(&isec->lock); return rc; out_invalid: spin_lock(&isec->lock); if (isec->initialized == LABEL_PENDING) { isec->initialized = LABEL_INVALID; isec->sid = sid; } spin_unlock(&isec->lock); return 0; } /* Convert a Linux signal to an access vector. */ static inline u32 signal_to_av(int sig) { u32 perm = 0; switch (sig) { case SIGCHLD: /* Commonly granted from child to parent. */ perm = PROCESS__SIGCHLD; break; case SIGKILL: /* Cannot be caught or ignored */ perm = PROCESS__SIGKILL; break; case SIGSTOP: /* Cannot be caught or ignored */ perm = PROCESS__SIGSTOP; break; default: /* All other signals. */ perm = PROCESS__SIGNAL; break; } return perm; } #if CAP_LAST_CAP > 63 #error Fix SELinux to handle capabilities > 63. #endif /* Check whether a task is allowed to use a capability. */ static int cred_has_capability(const struct cred *cred, int cap, unsigned int opts, bool initns) { struct common_audit_data ad; struct av_decision avd; u16 sclass; u32 sid = cred_sid(cred); u32 av = CAP_TO_MASK(cap); int rc; ad.type = LSM_AUDIT_DATA_CAP; ad.u.cap = cap; switch (CAP_TO_INDEX(cap)) { case 0: sclass = initns ? SECCLASS_CAPABILITY : SECCLASS_CAP_USERNS; break; case 1: sclass = initns ? SECCLASS_CAPABILITY2 : SECCLASS_CAP2_USERNS; break; default: pr_err("SELinux: out of range capability %d\n", cap); BUG(); return -EINVAL; } rc = avc_has_perm_noaudit(&selinux_state, sid, sid, sclass, av, 0, &avd); if (!(opts & CAP_OPT_NOAUDIT)) { int rc2 = avc_audit(&selinux_state, sid, sid, sclass, av, &avd, rc, &ad); if (rc2) return rc2; } return rc; } /* Check whether a task has a particular permission to an inode. The 'adp' parameter is optional and allows other audit data to be passed (e.g. the dentry). */ static int inode_has_perm(const struct cred *cred, struct inode *inode, u32 perms, struct common_audit_data *adp) { struct inode_security_struct *isec; u32 sid; validate_creds(cred); if (unlikely(IS_PRIVATE(inode))) return 0; sid = cred_sid(cred); isec = selinux_inode(inode); return avc_has_perm(&selinux_state, sid, isec->sid, isec->sclass, perms, adp); } /* Same as inode_has_perm, but pass explicit audit data containing the dentry to help the auditing code to more easily generate the pathname if needed. */ static inline int dentry_has_perm(const struct cred *cred, struct dentry *dentry, u32 av) { struct inode *inode = d_backing_inode(dentry); struct common_audit_data ad; ad.type = LSM_AUDIT_DATA_DENTRY; ad.u.dentry = dentry; __inode_security_revalidate(inode, dentry, true); return inode_has_perm(cred, inode, av, &ad); } /* Same as inode_has_perm, but pass explicit audit data containing the path to help the auditing code to more easily generate the pathname if needed. */ static inline int path_has_perm(const struct cred *cred, const struct path *path, u32 av) { struct inode *inode = d_backing_inode(path->dentry); struct common_audit_data ad; ad.type = LSM_AUDIT_DATA_PATH; ad.u.path = *path; __inode_security_revalidate(inode, path->dentry, true); return inode_has_perm(cred, inode, av, &ad); } /* Same as path_has_perm, but uses the inode from the file struct. */ static inline int file_path_has_perm(const struct cred *cred, struct file *file, u32 av) { struct common_audit_data ad; ad.type = LSM_AUDIT_DATA_FILE; ad.u.file = file; return inode_has_perm(cred, file_inode(file), av, &ad); } #ifdef CONFIG_BPF_SYSCALL static int bpf_fd_pass(struct file *file, u32 sid); #endif /* Check whether a task can use an open file descriptor to access an inode in a given way. Check access to the descriptor itself, and then use dentry_has_perm to check a particular permission to the file. Access to the descriptor is implicitly granted if it has the same SID as the process. If av is zero, then access to the file is not checked, e.g. for cases where only the descriptor is affected like seek. */ static int file_has_perm(const struct cred *cred, struct file *file, u32 av) { struct file_security_struct *fsec = selinux_file(file); struct inode *inode = file_inode(file); struct common_audit_data ad; u32 sid = cred_sid(cred); int rc; ad.type = LSM_AUDIT_DATA_FILE; ad.u.file = file; if (sid != fsec->sid) { rc = avc_has_perm(&selinux_state, sid, fsec->sid, SECCLASS_FD, FD__USE, &ad); if (rc) goto out; } #ifdef CONFIG_BPF_SYSCALL rc = bpf_fd_pass(file, cred_sid(cred)); if (rc) return rc; #endif /* av is zero if only checking access to the descriptor. */ rc = 0; if (av) rc = inode_has_perm(cred, inode, av, &ad); out: return rc; } /* * Determine the label for an inode that might be unioned. */ static int selinux_determine_inode_label(const struct task_security_struct *tsec, struct inode *dir, const struct qstr *name, u16 tclass, u32 *_new_isid) { const struct superblock_security_struct *sbsec = selinux_superblock(dir->i_sb); if ((sbsec->flags & SE_SBINITIALIZED) && (sbsec->behavior == SECURITY_FS_USE_MNTPOINT)) { *_new_isid = sbsec->mntpoint_sid; } else if ((sbsec->flags & SBLABEL_MNT) && tsec->create_sid) { *_new_isid = tsec->create_sid; } else { const struct inode_security_struct *dsec = inode_security(dir); return security_transition_sid(&selinux_state, tsec->sid, dsec->sid, tclass, name, _new_isid); } return 0; } /* Check whether a task can create a file. */ static int may_create(struct inode *dir, struct dentry *dentry, u16 tclass) { const struct task_security_struct *tsec = selinux_cred(current_cred()); struct inode_security_struct *dsec; struct superblock_security_struct *sbsec; u32 sid, newsid; struct common_audit_data ad; int rc; dsec = inode_security(dir); sbsec = selinux_superblock(dir->i_sb); sid = tsec->sid; ad.type = LSM_AUDIT_DATA_DENTRY; ad.u.dentry = dentry; rc = avc_has_perm(&selinux_state, sid, dsec->sid, SECCLASS_DIR, DIR__ADD_NAME | DIR__SEARCH, &ad); if (rc) return rc; rc = selinux_determine_inode_label(tsec, dir, &dentry->d_name, tclass, &newsid); if (rc) return rc; rc = avc_has_perm(&selinux_state, sid, newsid, tclass, FILE__CREATE, &ad); if (rc) return rc; return avc_has_perm(&selinux_state, newsid, sbsec->sid, SECCLASS_FILESYSTEM, FILESYSTEM__ASSOCIATE, &ad); } #define MAY_LINK 0 #define MAY_UNLINK 1 #define MAY_RMDIR 2 /* Check whether a task can link, unlink, or rmdir a file/directory. */ static int may_link(struct inode *dir, struct dentry *dentry, int kind) { struct inode_security_struct *dsec, *isec; struct common_audit_data ad; u32 sid = current_sid(); u32 av; int rc; dsec = inode_security(dir); isec = backing_inode_security(dentry); ad.type = LSM_AUDIT_DATA_DENTRY; ad.u.dentry = dentry; av = DIR__SEARCH; av |= (kind ? DIR__REMOVE_NAME : DIR__ADD_NAME); rc = avc_has_perm(&selinux_state, sid, dsec->sid, SECCLASS_DIR, av, &ad); if (rc) return rc; switch (kind) { case MAY_LINK: av = FILE__LINK; break; case MAY_UNLINK: av = FILE__UNLINK; break; case MAY_RMDIR: av = DIR__RMDIR; break; default: pr_warn("SELinux: %s: unrecognized kind %d\n", __func__, kind); return 0; } rc = avc_has_perm(&selinux_state, sid, isec->sid, isec->sclass, av, &ad); return rc; } static inline int may_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { struct inode_security_struct *old_dsec, *new_dsec, *old_isec, *new_isec; struct common_audit_data ad; u32 sid = current_sid(); u32 av; int old_is_dir, new_is_dir; int rc; old_dsec = inode_security(old_dir); old_isec = backing_inode_security(old_dentry); old_is_dir = d_is_dir(old_dentry); new_dsec = inode_security(new_dir); ad.type = LSM_AUDIT_DATA_DENTRY; ad.u.dentry = old_dentry; rc = avc_has_perm(&selinux_state, sid, old_dsec->sid, SECCLASS_DIR, DIR__REMOVE_NAME | DIR__SEARCH, &ad); if (rc) return rc; rc = avc_has_perm(&selinux_state, sid, old_isec->sid, old_isec->sclass, FILE__RENAME, &ad); if (rc) return rc; if (old_is_dir && new_dir != old_dir) { rc = avc_has_perm(&selinux_state, sid, old_isec->sid, old_isec->sclass, DIR__REPARENT, &ad); if (rc) return rc; } ad.u.dentry = new_dentry; av = DIR__ADD_NAME | DIR__SEARCH; if (d_is_positive(new_dentry)) av |= DIR__REMOVE_NAME; rc = avc_has_perm(&selinux_state, sid, new_dsec->sid, SECCLASS_DIR, av, &ad); if (rc) return rc; if (d_is_positive(new_dentry)) { new_isec = backing_inode_security(new_dentry); new_is_dir = d_is_dir(new_dentry); rc = avc_has_perm(&selinux_state, sid, new_isec->sid, new_isec->sclass, (new_is_dir ? DIR__RMDIR : FILE__UNLINK), &ad); if (rc) return rc; } return 0; } /* Check whether a task can perform a filesystem operation. */ static int superblock_has_perm(const struct cred *cred, struct super_block *sb, u32 perms, struct common_audit_data *ad) { struct superblock_security_struct *sbsec; u32 sid = cred_sid(cred); sbsec = selinux_superblock(sb); return avc_has_perm(&selinux_state, sid, sbsec->sid, SECCLASS_FILESYSTEM, perms, ad); } /* Convert a Linux mode and permission mask to an access vector. */ static inline u32 file_mask_to_av(int mode, int mask) { u32 av = 0; if (!S_ISDIR(mode)) { if (mask & MAY_EXEC) av |= FILE__EXECUTE; if (mask & MAY_READ) av |= FILE__READ; if (mask & MAY_APPEND) av |= FILE__APPEND; else if (mask & MAY_WRITE) av |= FILE__WRITE; } else { if (mask & MAY_EXEC) av |= DIR__SEARCH; if (mask & MAY_WRITE) av |= DIR__WRITE; if (mask & MAY_READ) av |= DIR__READ; } return av; } /* Convert a Linux file to an access vector. */ static inline u32 file_to_av(struct file *file) { u32 av = 0; if (file->f_mode & FMODE_READ) av |= FILE__READ; if (file->f_mode & FMODE_WRITE) { if (file->f_flags & O_APPEND) av |= FILE__APPEND; else av |= FILE__WRITE; } if (!av) { /* * Special file opened with flags 3 for ioctl-only use. */ av = FILE__IOCTL; } return av; } /* * Convert a file to an access vector and include the correct * open permission. */ static inline u32 open_file_to_av(struct file *file) { u32 av = file_to_av(file); struct inode *inode = file_inode(file); if (selinux_policycap_openperm() && inode->i_sb->s_magic != SOCKFS_MAGIC) av |= FILE__OPEN; return av; } /* Hook functions begin here. */ static int selinux_binder_set_context_mgr(const struct cred *mgr) { return avc_has_perm(&selinux_state, current_sid(), cred_sid(mgr), SECCLASS_BINDER, BINDER__SET_CONTEXT_MGR, NULL); } static int selinux_binder_transaction(const struct cred *from, const struct cred *to) { u32 mysid = current_sid(); u32 fromsid = cred_sid(from); u32 tosid = cred_sid(to); int rc; if (mysid != fromsid) { rc = avc_has_perm(&selinux_state, mysid, fromsid, SECCLASS_BINDER, BINDER__IMPERSONATE, NULL); if (rc) return rc; } return avc_has_perm(&selinux_state, fromsid, tosid, SECCLASS_BINDER, BINDER__CALL, NULL); } static int selinux_binder_transfer_binder(const struct cred *from, const struct cred *to) { return avc_has_perm(&selinux_state, cred_sid(from), cred_sid(to), SECCLASS_BINDER, BINDER__TRANSFER, NULL); } static int selinux_binder_transfer_file(const struct cred *from, const struct cred *to, struct file *file) { u32 sid = cred_sid(to); struct file_security_struct *fsec = selinux_file(file); struct dentry *dentry = file->f_path.dentry; struct inode_security_struct *isec; struct common_audit_data ad; int rc; ad.type = LSM_AUDIT_DATA_PATH; ad.u.path = file->f_path; if (sid != fsec->sid) { rc = avc_has_perm(&selinux_state, sid, fsec->sid, SECCLASS_FD, FD__USE, &ad); if (rc) return rc; } #ifdef CONFIG_BPF_SYSCALL rc = bpf_fd_pass(file, sid); if (rc) return rc; #endif if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; isec = backing_inode_security(dentry); return avc_has_perm(&selinux_state, sid, isec->sid, isec->sclass, file_to_av(file), &ad); } static int selinux_ptrace_access_check(struct task_struct *child, unsigned int mode) { u32 sid = current_sid(); u32 csid = task_sid_obj(child); if (mode & PTRACE_MODE_READ) return avc_has_perm(&selinux_state, sid, csid, SECCLASS_FILE, FILE__READ, NULL); return avc_has_perm(&selinux_state, sid, csid, SECCLASS_PROCESS, PROCESS__PTRACE, NULL); } static int selinux_ptrace_traceme(struct task_struct *parent) { return avc_has_perm(&selinux_state, task_sid_obj(parent), task_sid_obj(current), SECCLASS_PROCESS, PROCESS__PTRACE, NULL); } static int selinux_capget(struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted) { return avc_has_perm(&selinux_state, current_sid(), task_sid_obj(target), SECCLASS_PROCESS, PROCESS__GETCAP, NULL); } static int selinux_capset(struct cred *new, const struct cred *old, const kernel_cap_t *effective, const kernel_cap_t *inheritable, const kernel_cap_t *permitted) { return avc_has_perm(&selinux_state, cred_sid(old), cred_sid(new), SECCLASS_PROCESS, PROCESS__SETCAP, NULL); } /* * (This comment used to live with the selinux_task_setuid hook, * which was removed). * * Since setuid only affects the current process, and since the SELinux * controls are not based on the Linux identity attributes, SELinux does not * need to control this operation. However, SELinux does control the use of * the CAP_SETUID and CAP_SETGID capabilities using the capable hook. */ static int selinux_capable(const struct cred *cred, struct user_namespace *ns, int cap, unsigned int opts) { return cred_has_capability(cred, cap, opts, ns == &init_user_ns); } static int selinux_quotactl(int cmds, int type, int id, struct super_block *sb) { const struct cred *cred = current_cred(); int rc = 0; if (!sb) return 0; switch (cmds) { case Q_SYNC: case Q_QUOTAON: case Q_QUOTAOFF: case Q_SETINFO: case Q_SETQUOTA: case Q_XQUOTAOFF: case Q_XQUOTAON: case Q_XSETQLIM: rc = superblock_has_perm(cred, sb, FILESYSTEM__QUOTAMOD, NULL); break; case Q_GETFMT: case Q_GETINFO: case Q_GETQUOTA: case Q_XGETQUOTA: case Q_XGETQSTAT: case Q_XGETQSTATV: case Q_XGETNEXTQUOTA: rc = superblock_has_perm(cred, sb, FILESYSTEM__QUOTAGET, NULL); break; default: rc = 0; /* let the kernel handle invalid cmds */ break; } return rc; } static int selinux_quota_on(struct dentry *dentry) { const struct cred *cred = current_cred(); return dentry_has_perm(cred, dentry, FILE__QUOTAON); } static int selinux_syslog(int type) { switch (type) { case SYSLOG_ACTION_READ_ALL: /* Read last kernel messages */ case SYSLOG_ACTION_SIZE_BUFFER: /* Return size of the log buffer */ return avc_has_perm(&selinux_state, current_sid(), SECINITSID_KERNEL, SECCLASS_SYSTEM, SYSTEM__SYSLOG_READ, NULL); case SYSLOG_ACTION_CONSOLE_OFF: /* Disable logging to console */ case SYSLOG_ACTION_CONSOLE_ON: /* Enable logging to console */ /* Set level of messages printed to console */ case SYSLOG_ACTION_CONSOLE_LEVEL: return avc_has_perm(&selinux_state, current_sid(), SECINITSID_KERNEL, SECCLASS_SYSTEM, SYSTEM__SYSLOG_CONSOLE, NULL); } /* All other syslog types */ return avc_has_perm(&selinux_state, current_sid(), SECINITSID_KERNEL, SECCLASS_SYSTEM, SYSTEM__SYSLOG_MOD, NULL); } /* * Check that a process has enough memory to allocate a new virtual * mapping. 0 means there is enough memory for the allocation to * succeed and -ENOMEM implies there is not. * * Do not audit the selinux permission check, as this is applied to all * processes that allocate mappings. */ static int selinux_vm_enough_memory(struct mm_struct *mm, long pages) { int rc, cap_sys_admin = 0; rc = cred_has_capability(current_cred(), CAP_SYS_ADMIN, CAP_OPT_NOAUDIT, true); if (rc == 0) cap_sys_admin = 1; return cap_sys_admin; } /* binprm security operations */ static u32 ptrace_parent_sid(void) { u32 sid = 0; struct task_struct *tracer; rcu_read_lock(); tracer = ptrace_parent(current); if (tracer) sid = task_sid_obj(tracer); rcu_read_unlock(); return sid; } static int check_nnp_nosuid(const struct linux_binprm *bprm, const struct task_security_struct *old_tsec, const struct task_security_struct *new_tsec) { int nnp = (bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS); int nosuid = !mnt_may_suid(bprm->file->f_path.mnt); int rc; u32 av; if (!nnp && !nosuid) return 0; /* neither NNP nor nosuid */ if (new_tsec->sid == old_tsec->sid) return 0; /* No change in credentials */ /* * If the policy enables the nnp_nosuid_transition policy capability, * then we permit transitions under NNP or nosuid if the * policy allows the corresponding permission between * the old and new contexts. */ if (selinux_policycap_nnp_nosuid_transition()) { av = 0; if (nnp) av |= PROCESS2__NNP_TRANSITION; if (nosuid) av |= PROCESS2__NOSUID_TRANSITION; rc = avc_has_perm(&selinux_state, old_tsec->sid, new_tsec->sid, SECCLASS_PROCESS2, av, NULL); if (!rc) return 0; } /* * We also permit NNP or nosuid transitions to bounded SIDs, * i.e. SIDs that are guaranteed to only be allowed a subset * of the permissions of the current SID. */ rc = security_bounded_transition(&selinux_state, old_tsec->sid, new_tsec->sid); if (!rc) return 0; /* * On failure, preserve the errno values for NNP vs nosuid. * NNP: Operation not permitted for caller. * nosuid: Permission denied to file. */ if (nnp) return -EPERM; return -EACCES; } static int selinux_bprm_creds_for_exec(struct linux_binprm *bprm) { const struct task_security_struct *old_tsec; struct task_security_struct *new_tsec; struct inode_security_struct *isec; struct common_audit_data ad; struct inode *inode = file_inode(bprm->file); int rc; /* SELinux context only depends on initial program or script and not * the script interpreter */ old_tsec = selinux_cred(current_cred()); new_tsec = selinux_cred(bprm->cred); isec = inode_security(inode); /* Default to the current task SID. */ new_tsec->sid = old_tsec->sid; new_tsec->osid = old_tsec->sid; /* Reset fs, key, and sock SIDs on execve. */ new_tsec->create_sid = 0; new_tsec->keycreate_sid = 0; new_tsec->sockcreate_sid = 0; if (old_tsec->exec_sid) { new_tsec->sid = old_tsec->exec_sid; /* Reset exec SID on execve. */ new_tsec->exec_sid = 0; /* Fail on NNP or nosuid if not an allowed transition. */ rc = check_nnp_nosuid(bprm, old_tsec, new_tsec); if (rc) return rc; } else { /* Check for a default transition on this program. */ rc = security_transition_sid(&selinux_state, old_tsec->sid, isec->sid, SECCLASS_PROCESS, NULL, &new_tsec->sid); if (rc) return rc; /* * Fallback to old SID on NNP or nosuid if not an allowed * transition. */ rc = check_nnp_nosuid(bprm, old_tsec, new_tsec); if (rc) new_tsec->sid = old_tsec->sid; } ad.type = LSM_AUDIT_DATA_FILE; ad.u.file = bprm->file; if (new_tsec->sid == old_tsec->sid) { rc = avc_has_perm(&selinux_state, old_tsec->sid, isec->sid, SECCLASS_FILE, FILE__EXECUTE_NO_TRANS, &ad); if (rc) return rc; } else { /* Check permissions for the transition. */ rc = avc_has_perm(&selinux_state, old_tsec->sid, new_tsec->sid, SECCLASS_PROCESS, PROCESS__TRANSITION, &ad); if (rc) return rc; rc = avc_has_perm(&selinux_state, new_tsec->sid, isec->sid, SECCLASS_FILE, FILE__ENTRYPOINT, &ad); if (rc) return rc; /* Check for shared state */ if (bprm->unsafe & LSM_UNSAFE_SHARE) { rc = avc_has_perm(&selinux_state, old_tsec->sid, new_tsec->sid, SECCLASS_PROCESS, PROCESS__SHARE, NULL); if (rc) return -EPERM; } /* Make sure that anyone attempting to ptrace over a task that * changes its SID has the appropriate permit */ if (bprm->unsafe & LSM_UNSAFE_PTRACE) { u32 ptsid = ptrace_parent_sid(); if (ptsid != 0) { rc = avc_has_perm(&selinux_state, ptsid, new_tsec->sid, SECCLASS_PROCESS, PROCESS__PTRACE, NULL); if (rc) return -EPERM; } } /* Clear any possibly unsafe personality bits on exec: */ bprm->per_clear |= PER_CLEAR_ON_SETID; /* Enable secure mode for SIDs transitions unless the noatsecure permission is granted between the two SIDs, i.e. ahp returns 0. */ rc = avc_has_perm(&selinux_state, old_tsec->sid, new_tsec->sid, SECCLASS_PROCESS, PROCESS__NOATSECURE, NULL); bprm->secureexec |= !!rc; } return 0; } static int match_file(const void *p, struct file *file, unsigned fd) { return file_has_perm(p, file, file_to_av(file)) ? fd + 1 : 0; } /* Derived from fs/exec.c:flush_old_files. */ static inline void flush_unauthorized_files(const struct cred *cred, struct files_struct *files) { struct file *file, *devnull = NULL; struct tty_struct *tty; int drop_tty = 0; unsigned n; tty = get_current_tty(); if (tty) { spin_lock(&tty->files_lock); if (!list_empty(&tty->tty_files)) { struct tty_file_private *file_priv; /* Revalidate access to controlling tty. Use file_path_has_perm on the tty path directly rather than using file_has_perm, as this particular open file may belong to another process and we are only interested in the inode-based check here. */ file_priv = list_first_entry(&tty->tty_files, struct tty_file_private, list); file = file_priv->file; if (file_path_has_perm(cred, file, FILE__READ | FILE__WRITE)) drop_tty = 1; } spin_unlock(&tty->files_lock); tty_kref_put(tty); } /* Reset controlling tty. */ if (drop_tty) no_tty(); /* Revalidate access to inherited open files. */ n = iterate_fd(files, 0, match_file, cred); if (!n) /* none found? */ return; devnull = dentry_open(&selinux_null, O_RDWR, cred); if (IS_ERR(devnull)) devnull = NULL; /* replace all the matching ones with this */ do { replace_fd(n - 1, devnull, 0); } while ((n = iterate_fd(files, n, match_file, cred)) != 0); if (devnull) fput(devnull); } /* * Prepare a process for imminent new credential changes due to exec */ static void selinux_bprm_committing_creds(struct linux_binprm *bprm) { struct task_security_struct *new_tsec; struct rlimit *rlim, *initrlim; int rc, i; new_tsec = selinux_cred(bprm->cred); if (new_tsec->sid == new_tsec->osid) return; /* Close files for which the new task SID is not authorized. */ flush_unauthorized_files(bprm->cred, current->files); /* Always clear parent death signal on SID transitions. */ current->pdeath_signal = 0; /* Check whether the new SID can inherit resource limits from the old * SID. If not, reset all soft limits to the lower of the current * task's hard limit and the init task's soft limit. * * Note that the setting of hard limits (even to lower them) can be * controlled by the setrlimit check. The inclusion of the init task's * soft limit into the computation is to avoid resetting soft limits * higher than the default soft limit for cases where the default is * lower than the hard limit, e.g. RLIMIT_CORE or RLIMIT_STACK. */ rc = avc_has_perm(&selinux_state, new_tsec->osid, new_tsec->sid, SECCLASS_PROCESS, PROCESS__RLIMITINH, NULL); if (rc) { /* protect against do_prlimit() */ task_lock(current); for (i = 0; i < RLIM_NLIMITS; i++) { rlim = current->signal->rlim + i; initrlim = init_task.signal->rlim + i; rlim->rlim_cur = min(rlim->rlim_max, initrlim->rlim_cur); } task_unlock(current); if (IS_ENABLED(CONFIG_POSIX_TIMERS)) update_rlimit_cpu(current, rlimit(RLIMIT_CPU)); } } /* * Clean up the process immediately after the installation of new credentials * due to exec */ static void selinux_bprm_committed_creds(struct linux_binprm *bprm) { const struct task_security_struct *tsec = selinux_cred(current_cred()); u32 osid, sid; int rc; osid = tsec->osid; sid = tsec->sid; if (sid == osid) return; /* Check whether the new SID can inherit signal state from the old SID. * If not, clear itimers to avoid subsequent signal generation and * flush and unblock signals. * * This must occur _after_ the task SID has been updated so that any * kill done after the flush will be checked against the new SID. */ rc = avc_has_perm(&selinux_state, osid, sid, SECCLASS_PROCESS, PROCESS__SIGINH, NULL); if (rc) { clear_itimer(); spin_lock_irq(&unrcu_pointer(current->sighand)->siglock); if (!fatal_signal_pending(current)) { flush_sigqueue(¤t->pending); flush_sigqueue(¤t->signal->shared_pending); flush_signal_handlers(current, 1); sigemptyset(¤t->blocked); recalc_sigpending(); } spin_unlock_irq(&unrcu_pointer(current->sighand)->siglock); } /* Wake up the parent if it is waiting so that it can recheck * wait permission to the new task SID. */ read_lock(&tasklist_lock); __wake_up_parent(current, unrcu_pointer(current->real_parent)); read_unlock(&tasklist_lock); } /* superblock security operations */ static int selinux_sb_alloc_security(struct super_block *sb) { struct superblock_security_struct *sbsec = selinux_superblock(sb); mutex_init(&sbsec->lock); INIT_LIST_HEAD(&sbsec->isec_head); spin_lock_init(&sbsec->isec_lock); sbsec->sid = SECINITSID_UNLABELED; sbsec->def_sid = SECINITSID_FILE; sbsec->mntpoint_sid = SECINITSID_UNLABELED; return 0; } static inline int opt_len(const char *s) { bool open_quote = false; int len; char c; for (len = 0; (c = s[len]) != '\0'; len++) { if (c == '"') open_quote = !open_quote; if (c == ',' && !open_quote) break; } return len; } static int selinux_sb_eat_lsm_opts(char *options, void **mnt_opts) { char *from = options; char *to = options; bool first = true; int rc; while (1) { int len = opt_len(from); int token; char *arg = NULL; token = match_opt_prefix(from, len, &arg); if (token != Opt_error) { char *p, *q; /* strip quotes */ if (arg) { for (p = q = arg; p < from + len; p++) { char c = *p; if (c != '"') *q++ = c; } arg = kmemdup_nul(arg, q - arg, GFP_KERNEL); if (!arg) { rc = -ENOMEM; goto free_opt; } } rc = selinux_add_opt(token, arg, mnt_opts); kfree(arg); arg = NULL; if (unlikely(rc)) { goto free_opt; } } else { if (!first) { // copy with preceding comma from--; len++; } if (to != from) memmove(to, from, len); to += len; first = false; } if (!from[len]) break; from += len + 1; } *to = '\0'; return 0; free_opt: if (*mnt_opts) { selinux_free_mnt_opts(*mnt_opts); *mnt_opts = NULL; } return rc; } static int selinux_sb_mnt_opts_compat(struct super_block *sb, void *mnt_opts) { struct selinux_mnt_opts *opts = mnt_opts; struct superblock_security_struct *sbsec = selinux_superblock(sb); /* * Superblock not initialized (i.e. no options) - reject if any * options specified, otherwise accept. */ if (!(sbsec->flags & SE_SBINITIALIZED)) return opts ? 1 : 0; /* * Superblock initialized and no options specified - reject if * superblock has any options set, otherwise accept. */ if (!opts) return (sbsec->flags & SE_MNTMASK) ? 1 : 0; if (opts->fscontext_sid) { if (bad_option(sbsec, FSCONTEXT_MNT, sbsec->sid, opts->fscontext_sid)) return 1; } if (opts->context_sid) { if (bad_option(sbsec, CONTEXT_MNT, sbsec->mntpoint_sid, opts->context_sid)) return 1; } if (opts->rootcontext_sid) { struct inode_security_struct *root_isec; root_isec = backing_inode_security(sb->s_root); if (bad_option(sbsec, ROOTCONTEXT_MNT, root_isec->sid, opts->rootcontext_sid)) return 1; } if (opts->defcontext_sid) { if (bad_option(sbsec, DEFCONTEXT_MNT, sbsec->def_sid, opts->defcontext_sid)) return 1; } return 0; } static int selinux_sb_remount(struct super_block *sb, void *mnt_opts) { struct selinux_mnt_opts *opts = mnt_opts; struct superblock_security_struct *sbsec = selinux_superblock(sb); if (!(sbsec->flags & SE_SBINITIALIZED)) return 0; if (!opts) return 0; if (opts->fscontext_sid) { if (bad_option(sbsec, FSCONTEXT_MNT, sbsec->sid, opts->fscontext_sid)) goto out_bad_option; } if (opts->context_sid) { if (bad_option(sbsec, CONTEXT_MNT, sbsec->mntpoint_sid, opts->context_sid)) goto out_bad_option; } if (opts->rootcontext_sid) { struct inode_security_struct *root_isec; root_isec = backing_inode_security(sb->s_root); if (bad_option(sbsec, ROOTCONTEXT_MNT, root_isec->sid, opts->rootcontext_sid)) goto out_bad_option; } if (opts->defcontext_sid) { if (bad_option(sbsec, DEFCONTEXT_MNT, sbsec->def_sid, opts->defcontext_sid)) goto out_bad_option; } return 0; out_bad_option: pr_warn("SELinux: unable to change security options " "during remount (dev %s, type=%s)\n", sb->s_id, sb->s_type->name); return -EINVAL; } static int selinux_sb_kern_mount(struct super_block *sb) { const struct cred *cred = current_cred(); struct common_audit_data ad; ad.type = LSM_AUDIT_DATA_DENTRY; ad.u.dentry = sb->s_root; return superblock_has_perm(cred, sb, FILESYSTEM__MOUNT, &ad); } static int selinux_sb_statfs(struct dentry *dentry) { const struct cred *cred = current_cred(); struct common_audit_data ad; ad.type = LSM_AUDIT_DATA_DENTRY; ad.u.dentry = dentry->d_sb->s_root; return superblock_has_perm(cred, dentry->d_sb, FILESYSTEM__GETATTR, &ad); } static int selinux_mount(const char *dev_name, const struct path *path, const char *type, unsigned long flags, void *data) { const struct cred *cred = current_cred(); if (flags & MS_REMOUNT) return superblock_has_perm(cred, path->dentry->d_sb, FILESYSTEM__REMOUNT, NULL); else return path_has_perm(cred, path, FILE__MOUNTON); } static int selinux_move_mount(const struct path *from_path, const struct path *to_path) { const struct cred *cred = current_cred(); return path_has_perm(cred, to_path, FILE__MOUNTON); } static int selinux_umount(struct vfsmount *mnt, int flags) { const struct cred *cred = current_cred(); return superblock_has_perm(cred, mnt->mnt_sb, FILESYSTEM__UNMOUNT, NULL); } static int selinux_fs_context_submount(struct fs_context *fc, struct super_block *reference) { const struct superblock_security_struct *sbsec = selinux_superblock(reference); struct selinux_mnt_opts *opts; /* * Ensure that fc->security remains NULL when no options are set * as expected by selinux_set_mnt_opts(). */ if (!(sbsec->flags & (FSCONTEXT_MNT|CONTEXT_MNT|DEFCONTEXT_MNT))) return 0; opts = kzalloc(sizeof(*opts), GFP_KERNEL); if (!opts) return -ENOMEM; if (sbsec->flags & FSCONTEXT_MNT) opts->fscontext_sid = sbsec->sid; if (sbsec->flags & CONTEXT_MNT) opts->context_sid = sbsec->mntpoint_sid; if (sbsec->flags & DEFCONTEXT_MNT) opts->defcontext_sid = sbsec->def_sid; fc->security = opts; return 0; } static int selinux_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc) { const struct selinux_mnt_opts *src = src_fc->security; if (!src) return 0; fc->security = kmemdup(src, sizeof(*src), GFP_KERNEL); return fc->security ? 0 : -ENOMEM; } static const struct fs_parameter_spec selinux_fs_parameters[] = { fsparam_string(CONTEXT_STR, Opt_context), fsparam_string(DEFCONTEXT_STR, Opt_defcontext), fsparam_string(FSCONTEXT_STR, Opt_fscontext), fsparam_string(ROOTCONTEXT_STR, Opt_rootcontext), fsparam_flag (SECLABEL_STR, Opt_seclabel), {} }; static int selinux_fs_context_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct fs_parse_result result; int opt; opt = fs_parse(fc, selinux_fs_parameters, param, &result); if (opt < 0) return opt; return selinux_add_opt(opt, param->string, &fc->security); } /* inode security operations */ static int selinux_inode_alloc_security(struct inode *inode) { struct inode_security_struct *isec = selinux_inode(inode); u32 sid = current_sid(); spin_lock_init(&isec->lock); INIT_LIST_HEAD(&isec->list); isec->inode = inode; isec->sid = SECINITSID_UNLABELED; isec->sclass = SECCLASS_FILE; isec->task_sid = sid; isec->initialized = LABEL_INVALID; return 0; } static void selinux_inode_free_security(struct inode *inode) { inode_free_security(inode); } static int selinux_dentry_init_security(struct dentry *dentry, int mode, const struct qstr *name, const char **xattr_name, void **ctx, u32 *ctxlen) { u32 newsid; int rc; rc = selinux_determine_inode_label(selinux_cred(current_cred()), d_inode(dentry->d_parent), name, inode_mode_to_security_class(mode), &newsid); if (rc) return rc; if (xattr_name) *xattr_name = XATTR_NAME_SELINUX; return security_sid_to_context(&selinux_state, newsid, (char **)ctx, ctxlen); } static int selinux_dentry_create_files_as(struct dentry *dentry, int mode, struct qstr *name, const struct cred *old, struct cred *new) { u32 newsid; int rc; struct task_security_struct *tsec; rc = selinux_determine_inode_label(selinux_cred(old), d_inode(dentry->d_parent), name, inode_mode_to_security_class(mode), &newsid); if (rc) return rc; tsec = selinux_cred(new); tsec->create_sid = newsid; return 0; } static int selinux_inode_init_security(struct inode *inode, struct inode *dir, const struct qstr *qstr, const char **name, void **value, size_t *len) { const struct task_security_struct *tsec = selinux_cred(current_cred()); struct superblock_security_struct *sbsec; u32 newsid, clen; int rc; char *context; sbsec = selinux_superblock(dir->i_sb); newsid = tsec->create_sid; rc = selinux_determine_inode_label(tsec, dir, qstr, inode_mode_to_security_class(inode->i_mode), &newsid); if (rc) return rc; /* Possibly defer initialization to selinux_complete_init. */ if (sbsec->flags & SE_SBINITIALIZED) { struct inode_security_struct *isec = selinux_inode(inode); isec->sclass = inode_mode_to_security_class(inode->i_mode); isec->sid = newsid; isec->initialized = LABEL_INITIALIZED; } if (!selinux_initialized(&selinux_state) || !(sbsec->flags & SBLABEL_MNT)) return -EOPNOTSUPP; if (name) *name = XATTR_SELINUX_SUFFIX; if (value && len) { rc = security_sid_to_context_force(&selinux_state, newsid, &context, &clen); if (rc) return rc; *value = context; *len = clen; } return 0; } static int selinux_inode_init_security_anon(struct inode *inode, const struct qstr *name, const struct inode *context_inode) { const struct task_security_struct *tsec = selinux_cred(current_cred()); struct common_audit_data ad; struct inode_security_struct *isec; int rc; if (unlikely(!selinux_initialized(&selinux_state))) return 0; isec = selinux_inode(inode); /* * We only get here once per ephemeral inode. The inode has * been initialized via inode_alloc_security but is otherwise * untouched. */ if (context_inode) { struct inode_security_struct *context_isec = selinux_inode(context_inode); if (context_isec->initialized != LABEL_INITIALIZED) { pr_err("SELinux: context_inode is not initialized"); return -EACCES; } isec->sclass = context_isec->sclass; isec->sid = context_isec->sid; } else { isec->sclass = SECCLASS_ANON_INODE; rc = security_transition_sid( &selinux_state, tsec->sid, tsec->sid, isec->sclass, name, &isec->sid); if (rc) return rc; } isec->initialized = LABEL_INITIALIZED; /* * Now that we've initialized security, check whether we're * allowed to actually create this type of anonymous inode. */ ad.type = LSM_AUDIT_DATA_ANONINODE; ad.u.anonclass = name ? (const char *)name->name : "?"; return avc_has_perm(&selinux_state, tsec->sid, isec->sid, isec->sclass, FILE__CREATE, &ad); } static int selinux_inode_create(struct inode *dir, struct dentry *dentry, umode_t mode) { return may_create(dir, dentry, SECCLASS_FILE); } static int selinux_inode_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry) { return may_link(dir, old_dentry, MAY_LINK); } static int selinux_inode_unlink(struct inode *dir, struct dentry *dentry) { return may_link(dir, dentry, MAY_UNLINK); } static int selinux_inode_symlink(struct inode *dir, struct dentry *dentry, const char *name) { return may_create(dir, dentry, SECCLASS_LNK_FILE); } static int selinux_inode_mkdir(struct inode *dir, struct dentry *dentry, umode_t mask) { return may_create(dir, dentry, SECCLASS_DIR); } static int selinux_inode_rmdir(struct inode *dir, struct dentry *dentry) { return may_link(dir, dentry, MAY_RMDIR); } static int selinux_inode_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { return may_create(dir, dentry, inode_mode_to_security_class(mode)); } static int selinux_inode_rename(struct inode *old_inode, struct dentry *old_dentry, struct inode *new_inode, struct dentry *new_dentry) { return may_rename(old_inode, old_dentry, new_inode, new_dentry); } static int selinux_inode_readlink(struct dentry *dentry) { const struct cred *cred = current_cred(); return dentry_has_perm(cred, dentry, FILE__READ); } static int selinux_inode_follow_link(struct dentry *dentry, struct inode *inode, bool rcu) { const struct cred *cred = current_cred(); struct common_audit_data ad; struct inode_security_struct *isec; u32 sid; validate_creds(cred); ad.type = LSM_AUDIT_DATA_DENTRY; ad.u.dentry = dentry; sid = cred_sid(cred); isec = inode_security_rcu(inode, rcu); if (IS_ERR(isec)) return PTR_ERR(isec); return avc_has_perm(&selinux_state, sid, isec->sid, isec->sclass, FILE__READ, &ad); } static noinline int audit_inode_permission(struct inode *inode, u32 perms, u32 audited, u32 denied, int result) { struct common_audit_data ad; struct inode_security_struct *isec = selinux_inode(inode); ad.type = LSM_AUDIT_DATA_INODE; ad.u.inode = inode; return slow_avc_audit(&selinux_state, current_sid(), isec->sid, isec->sclass, perms, audited, denied, result, &ad); } static int selinux_inode_permission(struct inode *inode, int mask) { const struct cred *cred = current_cred(); u32 perms; bool from_access; bool no_block = mask & MAY_NOT_BLOCK; struct inode_security_struct *isec; u32 sid; struct av_decision avd; int rc, rc2; u32 audited, denied; from_access = mask & MAY_ACCESS; mask &= (MAY_READ|MAY_WRITE|MAY_EXEC|MAY_APPEND); /* No permission to check. Existence test. */ if (!mask) return 0; validate_creds(cred); if (unlikely(IS_PRIVATE(inode))) return 0; perms = file_mask_to_av(inode->i_mode, mask); sid = cred_sid(cred); isec = inode_security_rcu(inode, no_block); if (IS_ERR(isec)) return PTR_ERR(isec); rc = avc_has_perm_noaudit(&selinux_state, sid, isec->sid, isec->sclass, perms, 0, &avd); audited = avc_audit_required(perms, &avd, rc, from_access ? FILE__AUDIT_ACCESS : 0, &denied); if (likely(!audited)) return rc; rc2 = audit_inode_permission(inode, perms, audited, denied, rc); if (rc2) return rc2; return rc; } static int selinux_inode_setattr(struct dentry *dentry, struct iattr *iattr) { const struct cred *cred = current_cred(); struct inode *inode = d_backing_inode(dentry); unsigned int ia_valid = iattr->ia_valid; __u32 av = FILE__WRITE; /* ATTR_FORCE is just used for ATTR_KILL_S[UG]ID. */ if (ia_valid & ATTR_FORCE) { ia_valid &= ~(ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_MODE | ATTR_FORCE); if (!ia_valid) return 0; } if (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_ATIME_SET | ATTR_MTIME_SET | ATTR_TIMES_SET)) return dentry_has_perm(cred, dentry, FILE__SETATTR); if (selinux_policycap_openperm() && inode->i_sb->s_magic != SOCKFS_MAGIC && (ia_valid & ATTR_SIZE) && !(ia_valid & ATTR_FILE)) av |= FILE__OPEN; return dentry_has_perm(cred, dentry, av); } static int selinux_inode_getattr(const struct path *path) { return path_has_perm(current_cred(), path, FILE__GETATTR); } static bool has_cap_mac_admin(bool audit) { const struct cred *cred = current_cred(); unsigned int opts = audit ? CAP_OPT_NONE : CAP_OPT_NOAUDIT; if (cap_capable(cred, &init_user_ns, CAP_MAC_ADMIN, opts)) return false; if (cred_has_capability(cred, CAP_MAC_ADMIN, opts, true)) return false; return true; } static int selinux_inode_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { struct inode *inode = d_backing_inode(dentry); struct inode_security_struct *isec; struct superblock_security_struct *sbsec; struct common_audit_data ad; u32 newsid, sid = current_sid(); int rc = 0; if (strcmp(name, XATTR_NAME_SELINUX)) { rc = cap_inode_setxattr(dentry, name, value, size, flags); if (rc) return rc; /* Not an attribute we recognize, so just check the ordinary setattr permission. */ return dentry_has_perm(current_cred(), dentry, FILE__SETATTR); } if (!selinux_initialized(&selinux_state)) return (inode_owner_or_capable(mnt_userns, inode) ? 0 : -EPERM); sbsec = selinux_superblock(inode->i_sb); if (!(sbsec->flags & SBLABEL_MNT)) return -EOPNOTSUPP; if (!inode_owner_or_capable(mnt_userns, inode)) return -EPERM; ad.type = LSM_AUDIT_DATA_DENTRY; ad.u.dentry = dentry; isec = backing_inode_security(dentry); rc = avc_has_perm(&selinux_state, sid, isec->sid, isec->sclass, FILE__RELABELFROM, &ad); if (rc) return rc; rc = security_context_to_sid(&selinux_state, value, size, &newsid, GFP_KERNEL); if (rc == -EINVAL) { if (!has_cap_mac_admin(true)) { struct audit_buffer *ab; size_t audit_size; /* We strip a nul only if it is at the end, otherwise the * context contains a nul and we should audit that */ if (value) { const char *str = value; if (str[size - 1] == '\0') audit_size = size - 1; else audit_size = size; } else { audit_size = 0; } ab = audit_log_start(audit_context(), GFP_ATOMIC, AUDIT_SELINUX_ERR); if (!ab) return rc; audit_log_format(ab, "op=setxattr invalid_context="); audit_log_n_untrustedstring(ab, value, audit_size); audit_log_end(ab); return rc; } rc = security_context_to_sid_force(&selinux_state, value, size, &newsid); } if (rc) return rc; rc = avc_has_perm(&selinux_state, sid, newsid, isec->sclass, FILE__RELABELTO, &ad); if (rc) return rc; rc = security_validate_transition(&selinux_state, isec->sid, newsid, sid, isec->sclass); if (rc) return rc; return avc_has_perm(&selinux_state, newsid, sbsec->sid, SECCLASS_FILESYSTEM, FILESYSTEM__ASSOCIATE, &ad); } static void selinux_inode_post_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { struct inode *inode = d_backing_inode(dentry); struct inode_security_struct *isec; u32 newsid; int rc; if (strcmp(name, XATTR_NAME_SELINUX)) { /* Not an attribute we recognize, so nothing to do. */ return; } if (!selinux_initialized(&selinux_state)) { /* If we haven't even been initialized, then we can't validate * against a policy, so leave the label as invalid. It may * resolve to a valid label on the next revalidation try if * we've since initialized. */ return; } rc = security_context_to_sid_force(&selinux_state, value, size, &newsid); if (rc) { pr_err("SELinux: unable to map context to SID" "for (%s, %lu), rc=%d\n", inode->i_sb->s_id, inode->i_ino, -rc); return; } isec = backing_inode_security(dentry); spin_lock(&isec->lock); isec->sclass = inode_mode_to_security_class(inode->i_mode); isec->sid = newsid; isec->initialized = LABEL_INITIALIZED; spin_unlock(&isec->lock); } static int selinux_inode_getxattr(struct dentry *dentry, const char *name) { const struct cred *cred = current_cred(); return dentry_has_perm(cred, dentry, FILE__GETATTR); } static int selinux_inode_listxattr(struct dentry *dentry) { const struct cred *cred = current_cred(); return dentry_has_perm(cred, dentry, FILE__GETATTR); } static int selinux_inode_removexattr(struct user_namespace *mnt_userns, struct dentry *dentry, const char *name) { if (strcmp(name, XATTR_NAME_SELINUX)) { int rc = cap_inode_removexattr(mnt_userns, dentry, name); if (rc) return rc; /* Not an attribute we recognize, so just check the ordinary setattr permission. */ return dentry_has_perm(current_cred(), dentry, FILE__SETATTR); } if (!selinux_initialized(&selinux_state)) return 0; /* No one is allowed to remove a SELinux security label. You can change the label, but all data must be labeled. */ return -EACCES; } static int selinux_path_notify(const struct path *path, u64 mask, unsigned int obj_type) { int ret; u32 perm; struct common_audit_data ad; ad.type = LSM_AUDIT_DATA_PATH; ad.u.path = *path; /* * Set permission needed based on the type of mark being set. * Performs an additional check for sb watches. */ switch (obj_type) { case FSNOTIFY_OBJ_TYPE_VFSMOUNT: perm = FILE__WATCH_MOUNT; break; case FSNOTIFY_OBJ_TYPE_SB: perm = FILE__WATCH_SB; ret = superblock_has_perm(current_cred(), path->dentry->d_sb, FILESYSTEM__WATCH, &ad); if (ret) return ret; break; case FSNOTIFY_OBJ_TYPE_INODE: perm = FILE__WATCH; break; default: return -EINVAL; } /* blocking watches require the file:watch_with_perm permission */ if (mask & (ALL_FSNOTIFY_PERM_EVENTS)) perm |= FILE__WATCH_WITH_PERM; /* watches on read-like events need the file:watch_reads permission */ if (mask & (FS_ACCESS | FS_ACCESS_PERM | FS_CLOSE_NOWRITE)) perm |= FILE__WATCH_READS; return path_has_perm(current_cred(), path, perm); } /* * Copy the inode security context value to the user. * * Permission check is handled by selinux_inode_getxattr hook. */ static int selinux_inode_getsecurity(struct user_namespace *mnt_userns, struct inode *inode, const char *name, void **buffer, bool alloc) { u32 size; int error; char *context = NULL; struct inode_security_struct *isec; /* * If we're not initialized yet, then we can't validate contexts, so * just let vfs_getxattr fall back to using the on-disk xattr. */ if (!selinux_initialized(&selinux_state) || strcmp(name, XATTR_SELINUX_SUFFIX)) return -EOPNOTSUPP; /* * If the caller has CAP_MAC_ADMIN, then get the raw context * value even if it is not defined by current policy; otherwise, * use the in-core value under current policy. * Use the non-auditing forms of the permission checks since * getxattr may be called by unprivileged processes commonly * and lack of permission just means that we fall back to the * in-core context value, not a denial. */ isec = inode_security(inode); if (has_cap_mac_admin(false)) error = security_sid_to_context_force(&selinux_state, isec->sid, &context, &size); else error = security_sid_to_context(&selinux_state, isec->sid, &context, &size); if (error) return error; error = size; if (alloc) { *buffer = context; goto out_nofree; } kfree(context); out_nofree: return error; } static int selinux_inode_setsecurity(struct inode *inode, const char *name, const void *value, size_t size, int flags) { struct inode_security_struct *isec = inode_security_novalidate(inode); struct superblock_security_struct *sbsec; u32 newsid; int rc; if (strcmp(name, XATTR_SELINUX_SUFFIX)) return -EOPNOTSUPP; sbsec = selinux_superblock(inode->i_sb); if (!(sbsec->flags & SBLABEL_MNT)) return -EOPNOTSUPP; if (!value || !size) return -EACCES; rc = security_context_to_sid(&selinux_state, value, size, &newsid, GFP_KERNEL); if (rc) return rc; spin_lock(&isec->lock); isec->sclass = inode_mode_to_security_class(inode->i_mode); isec->sid = newsid; isec->initialized = LABEL_INITIALIZED; spin_unlock(&isec->lock); return 0; } static int selinux_inode_listsecurity(struct inode *inode, char *buffer, size_t buffer_size) { const int len = sizeof(XATTR_NAME_SELINUX); if (!selinux_initialized(&selinux_state)) return 0; if (buffer && len <= buffer_size) memcpy(buffer, XATTR_NAME_SELINUX, len); return len; } static void selinux_inode_getsecid(struct inode *inode, u32 *secid) { struct inode_security_struct *isec = inode_security_novalidate(inode); *secid = isec->sid; } static int selinux_inode_copy_up(struct dentry *src, struct cred **new) { u32 sid; struct task_security_struct *tsec; struct cred *new_creds = *new; if (new_creds == NULL) { new_creds = prepare_creds(); if (!new_creds) return -ENOMEM; } tsec = selinux_cred(new_creds); /* Get label from overlay inode and set it in create_sid */ selinux_inode_getsecid(d_inode(src), &sid); tsec->create_sid = sid; *new = new_creds; return 0; } static int selinux_inode_copy_up_xattr(const char *name) { /* The copy_up hook above sets the initial context on an inode, but we * don't then want to overwrite it by blindly copying all the lower * xattrs up. Instead, we have to filter out SELinux-related xattrs. */ if (strcmp(name, XATTR_NAME_SELINUX) == 0) return 1; /* Discard */ /* * Any other attribute apart from SELINUX is not claimed, supported * by selinux. */ return -EOPNOTSUPP; } /* kernfs node operations */ static int selinux_kernfs_init_security(struct kernfs_node *kn_dir, struct kernfs_node *kn) { const struct task_security_struct *tsec = selinux_cred(current_cred()); u32 parent_sid, newsid, clen; int rc; char *context; rc = kernfs_xattr_get(kn_dir, XATTR_NAME_SELINUX, NULL, 0); if (rc == -ENODATA) return 0; else if (rc < 0) return rc; clen = (u32)rc; context = kmalloc(clen, GFP_KERNEL); if (!context) return -ENOMEM; rc = kernfs_xattr_get(kn_dir, XATTR_NAME_SELINUX, context, clen); if (rc < 0) { kfree(context); return rc; } rc = security_context_to_sid(&selinux_state, context, clen, &parent_sid, GFP_KERNEL); kfree(context); if (rc) return rc; if (tsec->create_sid) { newsid = tsec->create_sid; } else { u16 secclass = inode_mode_to_security_class(kn->mode); struct qstr q; q.name = kn->name; q.hash_len = hashlen_string(kn_dir, kn->name); rc = security_transition_sid(&selinux_state, tsec->sid, parent_sid, secclass, &q, &newsid); if (rc) return rc; } rc = security_sid_to_context_force(&selinux_state, newsid, &context, &clen); if (rc) return rc; rc = kernfs_xattr_set(kn, XATTR_NAME_SELINUX, context, clen, XATTR_CREATE); kfree(context); return rc; } /* file security operations */ static int selinux_revalidate_file_permission(struct file *file, int mask) { const struct cred *cred = current_cred(); struct inode *inode = file_inode(file); /* file_mask_to_av won't add FILE__WRITE if MAY_APPEND is set */ if ((file->f_flags & O_APPEND) && (mask & MAY_WRITE)) mask |= MAY_APPEND; return file_has_perm(cred, file, file_mask_to_av(inode->i_mode, mask)); } static int selinux_file_permission(struct file *file, int mask) { struct inode *inode = file_inode(file); struct file_security_struct *fsec = selinux_file(file); struct inode_security_struct *isec; u32 sid = current_sid(); if (!mask) /* No permission to check. Existence test. */ return 0; isec = inode_security(inode); if (sid == fsec->sid && fsec->isid == isec->sid && fsec->pseqno == avc_policy_seqno(&selinux_state)) /* No change since file_open check. */ return 0; return selinux_revalidate_file_permission(file, mask); } static int selinux_file_alloc_security(struct file *file) { struct file_security_struct *fsec = selinux_file(file); u32 sid = current_sid(); fsec->sid = sid; fsec->fown_sid = sid; return 0; } /* * Check whether a task has the ioctl permission and cmd * operation to an inode. */ static int ioctl_has_perm(const struct cred *cred, struct file *file, u32 requested, u16 cmd) { struct common_audit_data ad; struct file_security_struct *fsec = selinux_file(file); struct inode *inode = file_inode(file); struct inode_security_struct *isec; struct lsm_ioctlop_audit ioctl; u32 ssid = cred_sid(cred); int rc; u8 driver = cmd >> 8; u8 xperm = cmd & 0xff; ad.type = LSM_AUDIT_DATA_IOCTL_OP; ad.u.op = &ioctl; ad.u.op->cmd = cmd; ad.u.op->path = file->f_path; if (ssid != fsec->sid) { rc = avc_has_perm(&selinux_state, ssid, fsec->sid, SECCLASS_FD, FD__USE, &ad); if (rc) goto out; } if (unlikely(IS_PRIVATE(inode))) return 0; isec = inode_security(inode); rc = avc_has_extended_perms(&selinux_state, ssid, isec->sid, isec->sclass, requested, driver, xperm, &ad); out: return rc; } static int selinux_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { const struct cred *cred = current_cred(); int error = 0; switch (cmd) { case FIONREAD: case FIBMAP: case FIGETBSZ: case FS_IOC_GETFLAGS: case FS_IOC_GETVERSION: error = file_has_perm(cred, file, FILE__GETATTR); break; case FS_IOC_SETFLAGS: case FS_IOC_SETVERSION: error = file_has_perm(cred, file, FILE__SETATTR); break; /* sys_ioctl() checks */ case FIONBIO: case FIOASYNC: error = file_has_perm(cred, file, 0); break; case KDSKBENT: case KDSKBSENT: error = cred_has_capability(cred, CAP_SYS_TTY_CONFIG, CAP_OPT_NONE, true); break; case FIOCLEX: case FIONCLEX: if (!selinux_policycap_ioctl_skip_cloexec()) error = ioctl_has_perm(cred, file, FILE__IOCTL, (u16) cmd); break; /* default case assumes that the command will go * to the file's ioctl() function. */ default: error = ioctl_has_perm(cred, file, FILE__IOCTL, (u16) cmd); } return error; } static int selinux_file_ioctl_compat(struct file *file, unsigned int cmd, unsigned long arg) { /* * If we are in a 64-bit kernel running 32-bit userspace, we need to * make sure we don't compare 32-bit flags to 64-bit flags. */ switch (cmd) { case FS_IOC32_GETFLAGS: cmd = FS_IOC_GETFLAGS; break; case FS_IOC32_SETFLAGS: cmd = FS_IOC_SETFLAGS; break; case FS_IOC32_GETVERSION: cmd = FS_IOC_GETVERSION; break; case FS_IOC32_SETVERSION: cmd = FS_IOC_SETVERSION; break; default: break; } return selinux_file_ioctl(file, cmd, arg); } static int default_noexec __ro_after_init; static int file_map_prot_check(struct file *file, unsigned long prot, int shared) { const struct cred *cred = current_cred(); u32 sid = cred_sid(cred); int rc = 0; if (default_noexec && (prot & PROT_EXEC) && (!file || IS_PRIVATE(file_inode(file)) || (!shared && (prot & PROT_WRITE)))) { /* * We are making executable an anonymous mapping or a * private file mapping that will also be writable. * This has an additional check. */ rc = avc_has_perm(&selinux_state, sid, sid, SECCLASS_PROCESS, PROCESS__EXECMEM, NULL); if (rc) goto error; } if (file) { /* read access is always possible with a mapping */ u32 av = FILE__READ; /* write access only matters if the mapping is shared */ if (shared && (prot & PROT_WRITE)) av |= FILE__WRITE; if (prot & PROT_EXEC) av |= FILE__EXECUTE; return file_has_perm(cred, file, av); } error: return rc; } static int selinux_mmap_addr(unsigned long addr) { int rc = 0; if (addr < CONFIG_LSM_MMAP_MIN_ADDR) { u32 sid = current_sid(); rc = avc_has_perm(&selinux_state, sid, sid, SECCLASS_MEMPROTECT, MEMPROTECT__MMAP_ZERO, NULL); } return rc; } static int selinux_mmap_file(struct file *file, unsigned long reqprot, unsigned long prot, unsigned long flags) { struct common_audit_data ad; int rc; if (file) { ad.type = LSM_AUDIT_DATA_FILE; ad.u.file = file; rc = inode_has_perm(current_cred(), file_inode(file), FILE__MAP, &ad); if (rc) return rc; } if (checkreqprot_get(&selinux_state)) prot = reqprot; return file_map_prot_check(file, prot, (flags & MAP_TYPE) == MAP_SHARED); } static int selinux_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot, unsigned long prot) { const struct cred *cred = current_cred(); u32 sid = cred_sid(cred); if (checkreqprot_get(&selinux_state)) prot = reqprot; if (default_noexec && (prot & PROT_EXEC) && !(vma->vm_flags & VM_EXEC)) { int rc = 0; if (vma->vm_start >= vma->vm_mm->start_brk && vma->vm_end <= vma->vm_mm->brk) { rc = avc_has_perm(&selinux_state, sid, sid, SECCLASS_PROCESS, PROCESS__EXECHEAP, NULL); } else if (!vma->vm_file && ((vma->vm_start <= vma->vm_mm->start_stack && vma->vm_end >= vma->vm_mm->start_stack) || vma_is_stack_for_current(vma))) { rc = avc_has_perm(&selinux_state, sid, sid, SECCLASS_PROCESS, PROCESS__EXECSTACK, NULL); } else if (vma->vm_file && vma->anon_vma) { /* * We are making executable a file mapping that has * had some COW done. Since pages might have been * written, check ability to execute the possibly * modified content. This typically should only * occur for text relocations. */ rc = file_has_perm(cred, vma->vm_file, FILE__EXECMOD); } if (rc) return rc; } return file_map_prot_check(vma->vm_file, prot, vma->vm_flags&VM_SHARED); } static int selinux_file_lock(struct file *file, unsigned int cmd) { const struct cred *cred = current_cred(); return file_has_perm(cred, file, FILE__LOCK); } static int selinux_file_fcntl(struct file *file, unsigned int cmd, unsigned long arg) { const struct cred *cred = current_cred(); int err = 0; switch (cmd) { case F_SETFL: if ((file->f_flags & O_APPEND) && !(arg & O_APPEND)) { err = file_has_perm(cred, file, FILE__WRITE); break; } fallthrough; case F_SETOWN: case F_SETSIG: case F_GETFL: case F_GETOWN: case F_GETSIG: case F_GETOWNER_UIDS: /* Just check FD__USE permission */ err = file_has_perm(cred, file, 0); break; case F_GETLK: case F_SETLK: case F_SETLKW: case F_OFD_GETLK: case F_OFD_SETLK: case F_OFD_SETLKW: #if BITS_PER_LONG == 32 case F_GETLK64: case F_SETLK64: case F_SETLKW64: #endif err = file_has_perm(cred, file, FILE__LOCK); break; } return err; } static void selinux_file_set_fowner(struct file *file) { struct file_security_struct *fsec; fsec = selinux_file(file); fsec->fown_sid = current_sid(); } static int selinux_file_send_sigiotask(struct task_struct *tsk, struct fown_struct *fown, int signum) { struct file *file; u32 sid = task_sid_obj(tsk); u32 perm; struct file_security_struct *fsec; /* struct fown_struct is never outside the context of a struct file */ file = container_of(fown, struct file, f_owner); fsec = selinux_file(file); if (!signum) perm = signal_to_av(SIGIO); /* as per send_sigio_to_task */ else perm = signal_to_av(signum); return avc_has_perm(&selinux_state, fsec->fown_sid, sid, SECCLASS_PROCESS, perm, NULL); } static int selinux_file_receive(struct file *file) { const struct cred *cred = current_cred(); return file_has_perm(cred, file, file_to_av(file)); } static int selinux_file_open(struct file *file) { struct file_security_struct *fsec; struct inode_security_struct *isec; fsec = selinux_file(file); isec = inode_security(file_inode(file)); /* * Save inode label and policy sequence number * at open-time so that selinux_file_permission * can determine whether revalidation is necessary. * Task label is already saved in the file security * struct as its SID. */ fsec->isid = isec->sid; fsec->pseqno = avc_policy_seqno(&selinux_state); /* * Since the inode label or policy seqno may have changed * between the selinux_inode_permission check and the saving * of state above, recheck that access is still permitted. * Otherwise, access might never be revalidated against the * new inode label or new policy. * This check is not redundant - do not remove. */ return file_path_has_perm(file->f_cred, file, open_file_to_av(file)); } /* task security operations */ static int selinux_task_alloc(struct task_struct *task, unsigned long clone_flags) { u32 sid = current_sid(); return avc_has_perm(&selinux_state, sid, sid, SECCLASS_PROCESS, PROCESS__FORK, NULL); } /* * prepare a new set of credentials for modification */ static int selinux_cred_prepare(struct cred *new, const struct cred *old, gfp_t gfp) { const struct task_security_struct *old_tsec = selinux_cred(old); struct task_security_struct *tsec = selinux_cred(new); *tsec = *old_tsec; return 0; } /* * transfer the SELinux data to a blank set of creds */ static void selinux_cred_transfer(struct cred *new, const struct cred *old) { const struct task_security_struct *old_tsec = selinux_cred(old); struct task_security_struct *tsec = selinux_cred(new); *tsec = *old_tsec; } static void selinux_cred_getsecid(const struct cred *c, u32 *secid) { *secid = cred_sid(c); } /* * set the security data for a kernel service * - all the creation contexts are set to unlabelled */ static int selinux_kernel_act_as(struct cred *new, u32 secid) { struct task_security_struct *tsec = selinux_cred(new); u32 sid = current_sid(); int ret; ret = avc_has_perm(&selinux_state, sid, secid, SECCLASS_KERNEL_SERVICE, KERNEL_SERVICE__USE_AS_OVERRIDE, NULL); if (ret == 0) { tsec->sid = secid; tsec->create_sid = 0; tsec->keycreate_sid = 0; tsec->sockcreate_sid = 0; } return ret; } /* * set the file creation context in a security record to the same as the * objective context of the specified inode */ static int selinux_kernel_create_files_as(struct cred *new, struct inode *inode) { struct inode_security_struct *isec = inode_security(inode); struct task_security_struct *tsec = selinux_cred(new); u32 sid = current_sid(); int ret; ret = avc_has_perm(&selinux_state, sid, isec->sid, SECCLASS_KERNEL_SERVICE, KERNEL_SERVICE__CREATE_FILES_AS, NULL); if (ret == 0) tsec->create_sid = isec->sid; return ret; } static int selinux_kernel_module_request(char *kmod_name) { struct common_audit_data ad; ad.type = LSM_AUDIT_DATA_KMOD; ad.u.kmod_name = kmod_name; return avc_has_perm(&selinux_state, current_sid(), SECINITSID_KERNEL, SECCLASS_SYSTEM, SYSTEM__MODULE_REQUEST, &ad); } static int selinux_kernel_module_from_file(struct file *file) { struct common_audit_data ad; struct inode_security_struct *isec; struct file_security_struct *fsec; u32 sid = current_sid(); int rc; /* init_module */ if (file == NULL) return avc_has_perm(&selinux_state, sid, sid, SECCLASS_SYSTEM, SYSTEM__MODULE_LOAD, NULL); /* finit_module */ ad.type = LSM_AUDIT_DATA_FILE; ad.u.file = file; fsec = selinux_file(file); if (sid != fsec->sid) { rc = avc_has_perm(&selinux_state, sid, fsec->sid, SECCLASS_FD, FD__USE, &ad); if (rc) return rc; } isec = inode_security(file_inode(file)); return avc_has_perm(&selinux_state, sid, isec->sid, SECCLASS_SYSTEM, SYSTEM__MODULE_LOAD, &ad); } static int selinux_kernel_read_file(struct file *file, enum kernel_read_file_id id, bool contents) { int rc = 0; switch (id) { case READING_MODULE: rc = selinux_kernel_module_from_file(contents ? file : NULL); break; default: break; } return rc; } static int selinux_kernel_load_data(enum kernel_load_data_id id, bool contents) { int rc = 0; switch (id) { case LOADING_MODULE: rc = selinux_kernel_module_from_file(NULL); break; default: break; } return rc; } static int selinux_task_setpgid(struct task_struct *p, pid_t pgid) { return avc_has_perm(&selinux_state, current_sid(), task_sid_obj(p), SECCLASS_PROCESS, PROCESS__SETPGID, NULL); } static int selinux_task_getpgid(struct task_struct *p) { return avc_has_perm(&selinux_state, current_sid(), task_sid_obj(p), SECCLASS_PROCESS, PROCESS__GETPGID, NULL); } static int selinux_task_getsid(struct task_struct *p) { return avc_has_perm(&selinux_state, current_sid(), task_sid_obj(p), SECCLASS_PROCESS, PROCESS__GETSESSION, NULL); } static void selinux_current_getsecid_subj(u32 *secid) { *secid = current_sid(); } static void selinux_task_getsecid_obj(struct task_struct *p, u32 *secid) { *secid = task_sid_obj(p); } static int selinux_task_setnice(struct task_struct *p, int nice) { return avc_has_perm(&selinux_state, current_sid(), task_sid_obj(p), SECCLASS_PROCESS, PROCESS__SETSCHED, NULL); } static int selinux_task_setioprio(struct task_struct *p, int ioprio) { return avc_has_perm(&selinux_state, current_sid(), task_sid_obj(p), SECCLASS_PROCESS, PROCESS__SETSCHED, NULL); } static int selinux_task_getioprio(struct task_struct *p) { return avc_has_perm(&selinux_state, current_sid(), task_sid_obj(p), SECCLASS_PROCESS, PROCESS__GETSCHED, NULL); } static int selinux_task_prlimit(const struct cred *cred, const struct cred *tcred, unsigned int flags) { u32 av = 0; if (!flags) return 0; if (flags & LSM_PRLIMIT_WRITE) av |= PROCESS__SETRLIMIT; if (flags & LSM_PRLIMIT_READ) av |= PROCESS__GETRLIMIT; return avc_has_perm(&selinux_state, cred_sid(cred), cred_sid(tcred), SECCLASS_PROCESS, av, NULL); } static int selinux_task_setrlimit(struct task_struct *p, unsigned int resource, struct rlimit *new_rlim) { struct rlimit *old_rlim = p->signal->rlim + resource; /* Control the ability to change the hard limit (whether lowering or raising it), so that the hard limit can later be used as a safe reset point for the soft limit upon context transitions. See selinux_bprm_committing_creds. */ if (old_rlim->rlim_max != new_rlim->rlim_max) return avc_has_perm(&selinux_state, current_sid(), task_sid_obj(p), SECCLASS_PROCESS, PROCESS__SETRLIMIT, NULL); return 0; } static int selinux_task_setscheduler(struct task_struct *p) { return avc_has_perm(&selinux_state, current_sid(), task_sid_obj(p), SECCLASS_PROCESS, PROCESS__SETSCHED, NULL); } static int selinux_task_getscheduler(struct task_struct *p) { return avc_has_perm(&selinux_state, current_sid(), task_sid_obj(p), SECCLASS_PROCESS, PROCESS__GETSCHED, NULL); } static int selinux_task_movememory(struct task_struct *p) { return avc_has_perm(&selinux_state, current_sid(), task_sid_obj(p), SECCLASS_PROCESS, PROCESS__SETSCHED, NULL); } static int selinux_task_kill(struct task_struct *p, struct kernel_siginfo *info, int sig, const struct cred *cred) { u32 secid; u32 perm; if (!sig) perm = PROCESS__SIGNULL; /* null signal; existence test */ else perm = signal_to_av(sig); if (!cred) secid = current_sid(); else secid = cred_sid(cred); return avc_has_perm(&selinux_state, secid, task_sid_obj(p), SECCLASS_PROCESS, perm, NULL); } static void selinux_task_to_inode(struct task_struct *p, struct inode *inode) { struct inode_security_struct *isec = selinux_inode(inode); u32 sid = task_sid_obj(p); spin_lock(&isec->lock); isec->sclass = inode_mode_to_security_class(inode->i_mode); isec->sid = sid; isec->initialized = LABEL_INITIALIZED; spin_unlock(&isec->lock); } static int selinux_userns_create(const struct cred *cred) { u32 sid = current_sid(); return avc_has_perm(&selinux_state, sid, sid, SECCLASS_USER_NAMESPACE, USER_NAMESPACE__CREATE, NULL); } /* Returns error only if unable to parse addresses */ static int selinux_parse_skb_ipv4(struct sk_buff *skb, struct common_audit_data *ad, u8 *proto) { int offset, ihlen, ret = -EINVAL; struct iphdr _iph, *ih; offset = skb_network_offset(skb); ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); if (ih == NULL) goto out; ihlen = ih->ihl * 4; if (ihlen < sizeof(_iph)) goto out; ad->u.net->v4info.saddr = ih->saddr; ad->u.net->v4info.daddr = ih->daddr; ret = 0; if (proto) *proto = ih->protocol; switch (ih->protocol) { case IPPROTO_TCP: { struct tcphdr _tcph, *th; if (ntohs(ih->frag_off) & IP_OFFSET) break; offset += ihlen; th = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph); if (th == NULL) break; ad->u.net->sport = th->source; ad->u.net->dport = th->dest; break; } case IPPROTO_UDP: { struct udphdr _udph, *uh; if (ntohs(ih->frag_off) & IP_OFFSET) break; offset += ihlen; uh = skb_header_pointer(skb, offset, sizeof(_udph), &_udph); if (uh == NULL) break; ad->u.net->sport = uh->source; ad->u.net->dport = uh->dest; break; } case IPPROTO_DCCP: { struct dccp_hdr _dccph, *dh; if (ntohs(ih->frag_off) & IP_OFFSET) break; offset += ihlen; dh = skb_header_pointer(skb, offset, sizeof(_dccph), &_dccph); if (dh == NULL) break; ad->u.net->sport = dh->dccph_sport; ad->u.net->dport = dh->dccph_dport; break; } #if IS_ENABLED(CONFIG_IP_SCTP) case IPPROTO_SCTP: { struct sctphdr _sctph, *sh; if (ntohs(ih->frag_off) & IP_OFFSET) break; offset += ihlen; sh = skb_header_pointer(skb, offset, sizeof(_sctph), &_sctph); if (sh == NULL) break; ad->u.net->sport = sh->source; ad->u.net->dport = sh->dest; break; } #endif default: break; } out: return ret; } #if IS_ENABLED(CONFIG_IPV6) /* Returns error only if unable to parse addresses */ static int selinux_parse_skb_ipv6(struct sk_buff *skb, struct common_audit_data *ad, u8 *proto) { u8 nexthdr; int ret = -EINVAL, offset; struct ipv6hdr _ipv6h, *ip6; __be16 frag_off; offset = skb_network_offset(skb); ip6 = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h); if (ip6 == NULL) goto out; ad->u.net->v6info.saddr = ip6->saddr; ad->u.net->v6info.daddr = ip6->daddr; ret = 0; nexthdr = ip6->nexthdr; offset += sizeof(_ipv6h); offset = ipv6_skip_exthdr(skb, offset, &nexthdr, &frag_off); if (offset < 0) goto out; if (proto) *proto = nexthdr; switch (nexthdr) { case IPPROTO_TCP: { struct tcphdr _tcph, *th; th = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph); if (th == NULL) break; ad->u.net->sport = th->source; ad->u.net->dport = th->dest; break; } case IPPROTO_UDP: { struct udphdr _udph, *uh; uh = skb_header_pointer(skb, offset, sizeof(_udph), &_udph); if (uh == NULL) break; ad->u.net->sport = uh->source; ad->u.net->dport = uh->dest; break; } case IPPROTO_DCCP: { struct dccp_hdr _dccph, *dh; dh = skb_header_pointer(skb, offset, sizeof(_dccph), &_dccph); if (dh == NULL) break; ad->u.net->sport = dh->dccph_sport; ad->u.net->dport = dh->dccph_dport; break; } #if IS_ENABLED(CONFIG_IP_SCTP) case IPPROTO_SCTP: { struct sctphdr _sctph, *sh; sh = skb_header_pointer(skb, offset, sizeof(_sctph), &_sctph); if (sh == NULL) break; ad->u.net->sport = sh->source; ad->u.net->dport = sh->dest; break; } #endif /* includes fragments */ default: break; } out: return ret; } #endif /* IPV6 */ static int selinux_parse_skb(struct sk_buff *skb, struct common_audit_data *ad, char **_addrp, int src, u8 *proto) { char *addrp; int ret; switch (ad->u.net->family) { case PF_INET: ret = selinux_parse_skb_ipv4(skb, ad, proto); if (ret) goto parse_error; addrp = (char *)(src ? &ad->u.net->v4info.saddr : &ad->u.net->v4info.daddr); goto okay; #if IS_ENABLED(CONFIG_IPV6) case PF_INET6: ret = selinux_parse_skb_ipv6(skb, ad, proto); if (ret) goto parse_error; addrp = (char *)(src ? &ad->u.net->v6info.saddr : &ad->u.net->v6info.daddr); goto okay; #endif /* IPV6 */ default: addrp = NULL; goto okay; } parse_error: pr_warn( "SELinux: failure in selinux_parse_skb()," " unable to parse packet\n"); return ret; okay: if (_addrp) *_addrp = addrp; return 0; } /** * selinux_skb_peerlbl_sid - Determine the peer label of a packet * @skb: the packet * @family: protocol family * @sid: the packet's peer label SID * * Description: * Check the various different forms of network peer labeling and determine * the peer label/SID for the packet; most of the magic actually occurs in * the security server function security_net_peersid_cmp(). The function * returns zero if the value in @sid is valid (although it may be SECSID_NULL) * or -EACCES if @sid is invalid due to inconsistencies with the different * peer labels. * */ static int selinux_skb_peerlbl_sid(struct sk_buff *skb, u16 family, u32 *sid) { int err; u32 xfrm_sid; u32 nlbl_sid; u32 nlbl_type; err = selinux_xfrm_skb_sid(skb, &xfrm_sid); if (unlikely(err)) return -EACCES; err = selinux_netlbl_skbuff_getsid(skb, family, &nlbl_type, &nlbl_sid); if (unlikely(err)) return -EACCES; err = security_net_peersid_resolve(&selinux_state, nlbl_sid, nlbl_type, xfrm_sid, sid); if (unlikely(err)) { pr_warn( "SELinux: failure in selinux_skb_peerlbl_sid()," " unable to determine packet's peer label\n"); return -EACCES; } return 0; } /** * selinux_conn_sid - Determine the child socket label for a connection * @sk_sid: the parent socket's SID * @skb_sid: the packet's SID * @conn_sid: the resulting connection SID * * If @skb_sid is valid then the user:role:type information from @sk_sid is * combined with the MLS information from @skb_sid in order to create * @conn_sid. If @skb_sid is not valid then @conn_sid is simply a copy * of @sk_sid. Returns zero on success, negative values on failure. * */ static int selinux_conn_sid(u32 sk_sid, u32 skb_sid, u32 *conn_sid) { int err = 0; if (skb_sid != SECSID_NULL) err = security_sid_mls_copy(&selinux_state, sk_sid, skb_sid, conn_sid); else *conn_sid = sk_sid; return err; } /* socket security operations */ static int socket_sockcreate_sid(const struct task_security_struct *tsec, u16 secclass, u32 *socksid) { if (tsec->sockcreate_sid > SECSID_NULL) { *socksid = tsec->sockcreate_sid; return 0; } return security_transition_sid(&selinux_state, tsec->sid, tsec->sid, secclass, NULL, socksid); } static int sock_has_perm(struct sock *sk, u32 perms) { struct sk_security_struct *sksec = sk->sk_security; struct common_audit_data ad; struct lsm_network_audit net = {0,}; if (sksec->sid == SECINITSID_KERNEL) return 0; ad.type = LSM_AUDIT_DATA_NET; ad.u.net = &net; ad.u.net->sk = sk; return avc_has_perm(&selinux_state, current_sid(), sksec->sid, sksec->sclass, perms, &ad); } static int selinux_socket_create(int family, int type, int protocol, int kern) { const struct task_security_struct *tsec = selinux_cred(current_cred()); u32 newsid; u16 secclass; int rc; if (kern) return 0; secclass = socket_type_to_security_class(family, type, protocol); rc = socket_sockcreate_sid(tsec, secclass, &newsid); if (rc) return rc; return avc_has_perm(&selinux_state, tsec->sid, newsid, secclass, SOCKET__CREATE, NULL); } static int selinux_socket_post_create(struct socket *sock, int family, int type, int protocol, int kern) { const struct task_security_struct *tsec = selinux_cred(current_cred()); struct inode_security_struct *isec = inode_security_novalidate(SOCK_INODE(sock)); struct sk_security_struct *sksec; u16 sclass = socket_type_to_security_class(family, type, protocol); u32 sid = SECINITSID_KERNEL; int err = 0; if (!kern) { err = socket_sockcreate_sid(tsec, sclass, &sid); if (err) return err; } isec->sclass = sclass; isec->sid = sid; isec->initialized = LABEL_INITIALIZED; if (sock->sk) { sksec = sock->sk->sk_security; sksec->sclass = sclass; sksec->sid = sid; /* Allows detection of the first association on this socket */ if (sksec->sclass == SECCLASS_SCTP_SOCKET) sksec->sctp_assoc_state = SCTP_ASSOC_UNSET; err = selinux_netlbl_socket_post_create(sock->sk, family); } return err; } static int selinux_socket_socketpair(struct socket *socka, struct socket *sockb) { struct sk_security_struct *sksec_a = socka->sk->sk_security; struct sk_security_struct *sksec_b = sockb->sk->sk_security; sksec_a->peer_sid = sksec_b->sid; sksec_b->peer_sid = sksec_a->sid; return 0; } /* Range of port numbers used to automatically bind. Need to determine whether we should perform a name_bind permission check between the socket and the port number. */ static int selinux_socket_bind(struct socket *sock, struct sockaddr *address, int addrlen) { struct sock *sk = sock->sk; struct sk_security_struct *sksec = sk->sk_security; u16 family; int err; err = sock_has_perm(sk, SOCKET__BIND); if (err) goto out; /* If PF_INET or PF_INET6, check name_bind permission for the port. */ family = sk->sk_family; if (family == PF_INET || family == PF_INET6) { char *addrp; struct common_audit_data ad; struct lsm_network_audit net = {0,}; struct sockaddr_in *addr4 = NULL; struct sockaddr_in6 *addr6 = NULL; u16 family_sa; unsigned short snum; u32 sid, node_perm; /* * sctp_bindx(3) calls via selinux_sctp_bind_connect() * that validates multiple binding addresses. Because of this * need to check address->sa_family as it is possible to have * sk->sk_family = PF_INET6 with addr->sa_family = AF_INET. */ if (addrlen < offsetofend(struct sockaddr, sa_family)) return -EINVAL; family_sa = address->sa_family; switch (family_sa) { case AF_UNSPEC: case AF_INET: if (addrlen < sizeof(struct sockaddr_in)) return -EINVAL; addr4 = (struct sockaddr_in *)address; if (family_sa == AF_UNSPEC) { if (family == PF_INET6) { /* Length check from inet6_bind_sk() */ if (addrlen < SIN6_LEN_RFC2133) return -EINVAL; /* Family check from __inet6_bind() */ goto err_af; } /* see __inet_bind(), we only want to allow * AF_UNSPEC if the address is INADDR_ANY */ if (addr4->sin_addr.s_addr != htonl(INADDR_ANY)) goto err_af; family_sa = AF_INET; } snum = ntohs(addr4->sin_port); addrp = (char *)&addr4->sin_addr.s_addr; break; case AF_INET6: if (addrlen < SIN6_LEN_RFC2133) return -EINVAL; addr6 = (struct sockaddr_in6 *)address; snum = ntohs(addr6->sin6_port); addrp = (char *)&addr6->sin6_addr.s6_addr; break; default: goto err_af; } ad.type = LSM_AUDIT_DATA_NET; ad.u.net = &net; ad.u.net->sport = htons(snum); ad.u.net->family = family_sa; if (snum) { int low, high; inet_get_local_port_range(sock_net(sk), &low, &high); if (inet_port_requires_bind_service(sock_net(sk), snum) || snum < low || snum > high) { err = sel_netport_sid(sk->sk_protocol, snum, &sid); if (err) goto out; err = avc_has_perm(&selinux_state, sksec->sid, sid, sksec->sclass, SOCKET__NAME_BIND, &ad); if (err) goto out; } } switch (sksec->sclass) { case SECCLASS_TCP_SOCKET: node_perm = TCP_SOCKET__NODE_BIND; break; case SECCLASS_UDP_SOCKET: node_perm = UDP_SOCKET__NODE_BIND; break; case SECCLASS_DCCP_SOCKET: node_perm = DCCP_SOCKET__NODE_BIND; break; case SECCLASS_SCTP_SOCKET: node_perm = SCTP_SOCKET__NODE_BIND; break; default: node_perm = RAWIP_SOCKET__NODE_BIND; break; } err = sel_netnode_sid(addrp, family_sa, &sid); if (err) goto out; if (family_sa == AF_INET) ad.u.net->v4info.saddr = addr4->sin_addr.s_addr; else ad.u.net->v6info.saddr = addr6->sin6_addr; err = avc_has_perm(&selinux_state, sksec->sid, sid, sksec->sclass, node_perm, &ad); if (err) goto out; } out: return err; err_af: /* Note that SCTP services expect -EINVAL, others -EAFNOSUPPORT. */ if (sksec->sclass == SECCLASS_SCTP_SOCKET) return -EINVAL; return -EAFNOSUPPORT; } /* This supports connect(2) and SCTP connect services such as sctp_connectx(3) * and sctp_sendmsg(3) as described in Documentation/security/SCTP.rst */ static int selinux_socket_connect_helper(struct socket *sock, struct sockaddr *address, int addrlen) { struct sock *sk = sock->sk; struct sk_security_struct *sksec = sk->sk_security; int err; err = sock_has_perm(sk, SOCKET__CONNECT); if (err) return err; if (addrlen < offsetofend(struct sockaddr, sa_family)) return -EINVAL; /* connect(AF_UNSPEC) has special handling, as it is a documented * way to disconnect the socket */ if (address->sa_family == AF_UNSPEC) return 0; /* * If a TCP, DCCP or SCTP socket, check name_connect permission * for the port. */ if (sksec->sclass == SECCLASS_TCP_SOCKET || sksec->sclass == SECCLASS_DCCP_SOCKET || sksec->sclass == SECCLASS_SCTP_SOCKET) { struct common_audit_data ad; struct lsm_network_audit net = {0,}; struct sockaddr_in *addr4 = NULL; struct sockaddr_in6 *addr6 = NULL; unsigned short snum; u32 sid, perm; /* sctp_connectx(3) calls via selinux_sctp_bind_connect() * that validates multiple connect addresses. Because of this * need to check address->sa_family as it is possible to have * sk->sk_family = PF_INET6 with addr->sa_family = AF_INET. */ switch (address->sa_family) { case AF_INET: addr4 = (struct sockaddr_in *)address; if (addrlen < sizeof(struct sockaddr_in)) return -EINVAL; snum = ntohs(addr4->sin_port); break; case AF_INET6: addr6 = (struct sockaddr_in6 *)address; if (addrlen < SIN6_LEN_RFC2133) return -EINVAL; snum = ntohs(addr6->sin6_port); break; default: /* Note that SCTP services expect -EINVAL, whereas * others expect -EAFNOSUPPORT. */ if (sksec->sclass == SECCLASS_SCTP_SOCKET) return -EINVAL; else return -EAFNOSUPPORT; } err = sel_netport_sid(sk->sk_protocol, snum, &sid); if (err) return err; switch (sksec->sclass) { case SECCLASS_TCP_SOCKET: perm = TCP_SOCKET__NAME_CONNECT; break; case SECCLASS_DCCP_SOCKET: perm = DCCP_SOCKET__NAME_CONNECT; break; case SECCLASS_SCTP_SOCKET: perm = SCTP_SOCKET__NAME_CONNECT; break; } ad.type = LSM_AUDIT_DATA_NET; ad.u.net = &net; ad.u.net->dport = htons(snum); ad.u.net->family = address->sa_family; err = avc_has_perm(&selinux_state, sksec->sid, sid, sksec->sclass, perm, &ad); if (err) return err; } return 0; } /* Supports connect(2), see comments in selinux_socket_connect_helper() */ static int selinux_socket_connect(struct socket *sock, struct sockaddr *address, int addrlen) { int err; struct sock *sk = sock->sk; err = selinux_socket_connect_helper(sock, address, addrlen); if (err) return err; return selinux_netlbl_socket_connect(sk, address); } static int selinux_socket_listen(struct socket *sock, int backlog) { return sock_has_perm(sock->sk, SOCKET__LISTEN); } static int selinux_socket_accept(struct socket *sock, struct socket *newsock) { int err; struct inode_security_struct *isec; struct inode_security_struct *newisec; u16 sclass; u32 sid; err = sock_has_perm(sock->sk, SOCKET__ACCEPT); if (err) return err; isec = inode_security_novalidate(SOCK_INODE(sock)); spin_lock(&isec->lock); sclass = isec->sclass; sid = isec->sid; spin_unlock(&isec->lock); newisec = inode_security_novalidate(SOCK_INODE(newsock)); newisec->sclass = sclass; newisec->sid = sid; newisec->initialized = LABEL_INITIALIZED; return 0; } static int selinux_socket_sendmsg(struct socket *sock, struct msghdr *msg, int size) { return sock_has_perm(sock->sk, SOCKET__WRITE); } static int selinux_socket_recvmsg(struct socket *sock, struct msghdr *msg, int size, int flags) { return sock_has_perm(sock->sk, SOCKET__READ); } static int selinux_socket_getsockname(struct socket *sock) { return sock_has_perm(sock->sk, SOCKET__GETATTR); } static int selinux_socket_getpeername(struct socket *sock) { return sock_has_perm(sock->sk, SOCKET__GETATTR); } static int selinux_socket_setsockopt(struct socket *sock, int level, int optname) { int err; err = sock_has_perm(sock->sk, SOCKET__SETOPT); if (err) return err; return selinux_netlbl_socket_setsockopt(sock, level, optname); } static int selinux_socket_getsockopt(struct socket *sock, int level, int optname) { return sock_has_perm(sock->sk, SOCKET__GETOPT); } static int selinux_socket_shutdown(struct socket *sock, int how) { return sock_has_perm(sock->sk, SOCKET__SHUTDOWN); } static int selinux_socket_unix_stream_connect(struct sock *sock, struct sock *other, struct sock *newsk) { struct sk_security_struct *sksec_sock = sock->sk_security; struct sk_security_struct *sksec_other = other->sk_security; struct sk_security_struct *sksec_new = newsk->sk_security; struct common_audit_data ad; struct lsm_network_audit net = {0,}; int err; ad.type = LSM_AUDIT_DATA_NET; ad.u.net = &net; ad.u.net->sk = other; err = avc_has_perm(&selinux_state, sksec_sock->sid, sksec_other->sid, sksec_other->sclass, UNIX_STREAM_SOCKET__CONNECTTO, &ad); if (err) return err; /* server child socket */ sksec_new->peer_sid = sksec_sock->sid; err = security_sid_mls_copy(&selinux_state, sksec_other->sid, sksec_sock->sid, &sksec_new->sid); if (err) return err; /* connecting socket */ sksec_sock->peer_sid = sksec_new->sid; return 0; } static int selinux_socket_unix_may_send(struct socket *sock, struct socket *other) { struct sk_security_struct *ssec = sock->sk->sk_security; struct sk_security_struct *osec = other->sk->sk_security; struct common_audit_data ad; struct lsm_network_audit net = {0,}; ad.type = LSM_AUDIT_DATA_NET; ad.u.net = &net; ad.u.net->sk = other->sk; return avc_has_perm(&selinux_state, ssec->sid, osec->sid, osec->sclass, SOCKET__SENDTO, &ad); } static int selinux_inet_sys_rcv_skb(struct net *ns, int ifindex, char *addrp, u16 family, u32 peer_sid, struct common_audit_data *ad) { int err; u32 if_sid; u32 node_sid; err = sel_netif_sid(ns, ifindex, &if_sid); if (err) return err; err = avc_has_perm(&selinux_state, peer_sid, if_sid, SECCLASS_NETIF, NETIF__INGRESS, ad); if (err) return err; err = sel_netnode_sid(addrp, family, &node_sid); if (err) return err; return avc_has_perm(&selinux_state, peer_sid, node_sid, SECCLASS_NODE, NODE__RECVFROM, ad); } static int selinux_sock_rcv_skb_compat(struct sock *sk, struct sk_buff *skb, u16 family) { int err = 0; struct sk_security_struct *sksec = sk->sk_security; u32 sk_sid = sksec->sid; struct common_audit_data ad; struct lsm_network_audit net = {0,}; char *addrp; ad.type = LSM_AUDIT_DATA_NET; ad.u.net = &net; ad.u.net->netif = skb->skb_iif; ad.u.net->family = family; err = selinux_parse_skb(skb, &ad, &addrp, 1, NULL); if (err) return err; if (selinux_secmark_enabled()) { err = avc_has_perm(&selinux_state, sk_sid, skb->secmark, SECCLASS_PACKET, PACKET__RECV, &ad); if (err) return err; } err = selinux_netlbl_sock_rcv_skb(sksec, skb, family, &ad); if (err) return err; err = selinux_xfrm_sock_rcv_skb(sksec->sid, skb, &ad); return err; } static int selinux_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb) { int err; struct sk_security_struct *sksec = sk->sk_security; u16 family = sk->sk_family; u32 sk_sid = sksec->sid; struct common_audit_data ad; struct lsm_network_audit net = {0,}; char *addrp; u8 secmark_active; u8 peerlbl_active; if (family != PF_INET && family != PF_INET6) return 0; /* Handle mapped IPv4 packets arriving via IPv6 sockets */ if (family == PF_INET6 && skb->protocol == htons(ETH_P_IP)) family = PF_INET; /* If any sort of compatibility mode is enabled then handoff processing * to the selinux_sock_rcv_skb_compat() function to deal with the * special handling. We do this in an attempt to keep this function * as fast and as clean as possible. */ if (!selinux_policycap_netpeer()) return selinux_sock_rcv_skb_compat(sk, skb, family); secmark_active = selinux_secmark_enabled(); peerlbl_active = selinux_peerlbl_enabled(); if (!secmark_active && !peerlbl_active) return 0; ad.type = LSM_AUDIT_DATA_NET; ad.u.net = &net; ad.u.net->netif = skb->skb_iif; ad.u.net->family = family; err = selinux_parse_skb(skb, &ad, &addrp, 1, NULL); if (err) return err; if (peerlbl_active) { u32 peer_sid; err = selinux_skb_peerlbl_sid(skb, family, &peer_sid); if (err) return err; err = selinux_inet_sys_rcv_skb(sock_net(sk), skb->skb_iif, addrp, family, peer_sid, &ad); if (err) { selinux_netlbl_err(skb, family, err, 0); return err; } err = avc_has_perm(&selinux_state, sk_sid, peer_sid, SECCLASS_PEER, PEER__RECV, &ad); if (err) { selinux_netlbl_err(skb, family, err, 0); return err; } } if (secmark_active) { err = avc_has_perm(&selinux_state, sk_sid, skb->secmark, SECCLASS_PACKET, PACKET__RECV, &ad); if (err) return err; } return err; } static int selinux_socket_getpeersec_stream(struct socket *sock, char __user *optval, int __user *optlen, unsigned len) { int err = 0; char *scontext; u32 scontext_len; struct sk_security_struct *sksec = sock->sk->sk_security; u32 peer_sid = SECSID_NULL; if (sksec->sclass == SECCLASS_UNIX_STREAM_SOCKET || sksec->sclass == SECCLASS_TCP_SOCKET || sksec->sclass == SECCLASS_SCTP_SOCKET) peer_sid = sksec->peer_sid; if (peer_sid == SECSID_NULL) return -ENOPROTOOPT; err = security_sid_to_context(&selinux_state, peer_sid, &scontext, &scontext_len); if (err) return err; if (scontext_len > len) { err = -ERANGE; goto out_len; } if (copy_to_user(optval, scontext, scontext_len)) err = -EFAULT; out_len: if (put_user(scontext_len, optlen)) err = -EFAULT; kfree(scontext); return err; } static int selinux_socket_getpeersec_dgram(struct socket *sock, struct sk_buff *skb, u32 *secid) { u32 peer_secid = SECSID_NULL; u16 family; struct inode_security_struct *isec; if (skb && skb->protocol == htons(ETH_P_IP)) family = PF_INET; else if (skb && skb->protocol == htons(ETH_P_IPV6)) family = PF_INET6; else if (sock) family = sock->sk->sk_family; else goto out; if (sock && family == PF_UNIX) { isec = inode_security_novalidate(SOCK_INODE(sock)); peer_secid = isec->sid; } else if (skb) selinux_skb_peerlbl_sid(skb, family, &peer_secid); out: *secid = peer_secid; if (peer_secid == SECSID_NULL) return -EINVAL; return 0; } static int selinux_sk_alloc_security(struct sock *sk, int family, gfp_t priority) { struct sk_security_struct *sksec; sksec = kzalloc(sizeof(*sksec), priority); if (!sksec) return -ENOMEM; sksec->peer_sid = SECINITSID_UNLABELED; sksec->sid = SECINITSID_UNLABELED; sksec->sclass = SECCLASS_SOCKET; selinux_netlbl_sk_security_reset(sksec); sk->sk_security = sksec; return 0; } static void selinux_sk_free_security(struct sock *sk) { struct sk_security_struct *sksec = sk->sk_security; sk->sk_security = NULL; selinux_netlbl_sk_security_free(sksec); kfree(sksec); } static void selinux_sk_clone_security(const struct sock *sk, struct sock *newsk) { struct sk_security_struct *sksec = sk->sk_security; struct sk_security_struct *newsksec = newsk->sk_security; newsksec->sid = sksec->sid; newsksec->peer_sid = sksec->peer_sid; newsksec->sclass = sksec->sclass; selinux_netlbl_sk_security_reset(newsksec); } static void selinux_sk_getsecid(struct sock *sk, u32 *secid) { if (!sk) *secid = SECINITSID_ANY_SOCKET; else { struct sk_security_struct *sksec = sk->sk_security; *secid = sksec->sid; } } static void selinux_sock_graft(struct sock *sk, struct socket *parent) { struct inode_security_struct *isec = inode_security_novalidate(SOCK_INODE(parent)); struct sk_security_struct *sksec = sk->sk_security; if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6 || sk->sk_family == PF_UNIX) isec->sid = sksec->sid; sksec->sclass = isec->sclass; } /* * Determines peer_secid for the asoc and updates socket's peer label * if it's the first association on the socket. */ static int selinux_sctp_process_new_assoc(struct sctp_association *asoc, struct sk_buff *skb) { struct sock *sk = asoc->base.sk; u16 family = sk->sk_family; struct sk_security_struct *sksec = sk->sk_security; struct common_audit_data ad; struct lsm_network_audit net = {0,}; int err; /* handle mapped IPv4 packets arriving via IPv6 sockets */ if (family == PF_INET6 && skb->protocol == htons(ETH_P_IP)) family = PF_INET; if (selinux_peerlbl_enabled()) { asoc->peer_secid = SECSID_NULL; /* This will return peer_sid = SECSID_NULL if there are * no peer labels, see security_net_peersid_resolve(). */ err = selinux_skb_peerlbl_sid(skb, family, &asoc->peer_secid); if (err) return err; if (asoc->peer_secid == SECSID_NULL) asoc->peer_secid = SECINITSID_UNLABELED; } else { asoc->peer_secid = SECINITSID_UNLABELED; } if (sksec->sctp_assoc_state == SCTP_ASSOC_UNSET) { sksec->sctp_assoc_state = SCTP_ASSOC_SET; /* Here as first association on socket. As the peer SID * was allowed by peer recv (and the netif/node checks), * then it is approved by policy and used as the primary * peer SID for getpeercon(3). */ sksec->peer_sid = asoc->peer_secid; } else if (sksec->peer_sid != asoc->peer_secid) { /* Other association peer SIDs are checked to enforce * consistency among the peer SIDs. */ ad.type = LSM_AUDIT_DATA_NET; ad.u.net = &net; ad.u.net->sk = asoc->base.sk; err = avc_has_perm(&selinux_state, sksec->peer_sid, asoc->peer_secid, sksec->sclass, SCTP_SOCKET__ASSOCIATION, &ad); if (err) return err; } return 0; } /* Called whenever SCTP receives an INIT or COOKIE ECHO chunk. This * happens on an incoming connect(2), sctp_connectx(3) or * sctp_sendmsg(3) (with no association already present). */ static int selinux_sctp_assoc_request(struct sctp_association *asoc, struct sk_buff *skb) { struct sk_security_struct *sksec = asoc->base.sk->sk_security; u32 conn_sid; int err; if (!selinux_policycap_extsockclass()) return 0; err = selinux_sctp_process_new_assoc(asoc, skb); if (err) return err; /* Compute the MLS component for the connection and store * the information in asoc. This will be used by SCTP TCP type * sockets and peeled off connections as they cause a new * socket to be generated. selinux_sctp_sk_clone() will then * plug this into the new socket. */ err = selinux_conn_sid(sksec->sid, asoc->peer_secid, &conn_sid); if (err) return err; asoc->secid = conn_sid; /* Set any NetLabel labels including CIPSO/CALIPSO options. */ return selinux_netlbl_sctp_assoc_request(asoc, skb); } /* Called when SCTP receives a COOKIE ACK chunk as the final * response to an association request (initited by us). */ static int selinux_sctp_assoc_established(struct sctp_association *asoc, struct sk_buff *skb) { struct sk_security_struct *sksec = asoc->base.sk->sk_security; if (!selinux_policycap_extsockclass()) return 0; /* Inherit secid from the parent socket - this will be picked up * by selinux_sctp_sk_clone() if the association gets peeled off * into a new socket. */ asoc->secid = sksec->sid; return selinux_sctp_process_new_assoc(asoc, skb); } /* Check if sctp IPv4/IPv6 addresses are valid for binding or connecting * based on their @optname. */ static int selinux_sctp_bind_connect(struct sock *sk, int optname, struct sockaddr *address, int addrlen) { int len, err = 0, walk_size = 0; void *addr_buf; struct sockaddr *addr; struct socket *sock; if (!selinux_policycap_extsockclass()) return 0; /* Process one or more addresses that may be IPv4 or IPv6 */ sock = sk->sk_socket; addr_buf = address; while (walk_size < addrlen) { if (walk_size + sizeof(sa_family_t) > addrlen) return -EINVAL; addr = addr_buf; switch (addr->sa_family) { case AF_UNSPEC: case AF_INET: len = sizeof(struct sockaddr_in); break; case AF_INET6: len = sizeof(struct sockaddr_in6); break; default: return -EINVAL; } if (walk_size + len > addrlen) return -EINVAL; err = -EINVAL; switch (optname) { /* Bind checks */ case SCTP_PRIMARY_ADDR: case SCTP_SET_PEER_PRIMARY_ADDR: case SCTP_SOCKOPT_BINDX_ADD: err = selinux_socket_bind(sock, addr, len); break; /* Connect checks */ case SCTP_SOCKOPT_CONNECTX: case SCTP_PARAM_SET_PRIMARY: case SCTP_PARAM_ADD_IP: case SCTP_SENDMSG_CONNECT: err = selinux_socket_connect_helper(sock, addr, len); if (err) return err; /* As selinux_sctp_bind_connect() is called by the * SCTP protocol layer, the socket is already locked, * therefore selinux_netlbl_socket_connect_locked() * is called here. The situations handled are: * sctp_connectx(3), sctp_sendmsg(3), sendmsg(2), * whenever a new IP address is added or when a new * primary address is selected. * Note that an SCTP connect(2) call happens before * the SCTP protocol layer and is handled via * selinux_socket_connect(). */ err = selinux_netlbl_socket_connect_locked(sk, addr); break; } if (err) return err; addr_buf += len; walk_size += len; } return 0; } /* Called whenever a new socket is created by accept(2) or sctp_peeloff(3). */ static void selinux_sctp_sk_clone(struct sctp_association *asoc, struct sock *sk, struct sock *newsk) { struct sk_security_struct *sksec = sk->sk_security; struct sk_security_struct *newsksec = newsk->sk_security; /* If policy does not support SECCLASS_SCTP_SOCKET then call * the non-sctp clone version. */ if (!selinux_policycap_extsockclass()) return selinux_sk_clone_security(sk, newsk); newsksec->sid = asoc->secid; newsksec->peer_sid = asoc->peer_secid; newsksec->sclass = sksec->sclass; selinux_netlbl_sctp_sk_clone(sk, newsk); } static int selinux_inet_conn_request(const struct sock *sk, struct sk_buff *skb, struct request_sock *req) { struct sk_security_struct *sksec = sk->sk_security; int err; u16 family = req->rsk_ops->family; u32 connsid; u32 peersid; err = selinux_skb_peerlbl_sid(skb, family, &peersid); if (err) return err; err = selinux_conn_sid(sksec->sid, peersid, &connsid); if (err) return err; req->secid = connsid; req->peer_secid = peersid; return selinux_netlbl_inet_conn_request(req, family); } static void selinux_inet_csk_clone(struct sock *newsk, const struct request_sock *req) { struct sk_security_struct *newsksec = newsk->sk_security; newsksec->sid = req->secid; newsksec->peer_sid = req->peer_secid; /* NOTE: Ideally, we should also get the isec->sid for the new socket in sync, but we don't have the isec available yet. So we will wait until sock_graft to do it, by which time it will have been created and available. */ /* We don't need to take any sort of lock here as we are the only * thread with access to newsksec */ selinux_netlbl_inet_csk_clone(newsk, req->rsk_ops->family); } static void selinux_inet_conn_established(struct sock *sk, struct sk_buff *skb) { u16 family = sk->sk_family; struct sk_security_struct *sksec = sk->sk_security; /* handle mapped IPv4 packets arriving via IPv6 sockets */ if (family == PF_INET6 && skb->protocol == htons(ETH_P_IP)) family = PF_INET; selinux_skb_peerlbl_sid(skb, family, &sksec->peer_sid); } static int selinux_secmark_relabel_packet(u32 sid) { const struct task_security_struct *__tsec; u32 tsid; __tsec = selinux_cred(current_cred()); tsid = __tsec->sid; return avc_has_perm(&selinux_state, tsid, sid, SECCLASS_PACKET, PACKET__RELABELTO, NULL); } static void selinux_secmark_refcount_inc(void) { atomic_inc(&selinux_secmark_refcount); } static void selinux_secmark_refcount_dec(void) { atomic_dec(&selinux_secmark_refcount); } static void selinux_req_classify_flow(const struct request_sock *req, struct flowi_common *flic) { flic->flowic_secid = req->secid; } static int selinux_tun_dev_alloc_security(void **security) { struct tun_security_struct *tunsec; tunsec = kzalloc(sizeof(*tunsec), GFP_KERNEL); if (!tunsec) return -ENOMEM; tunsec->sid = current_sid(); *security = tunsec; return 0; } static void selinux_tun_dev_free_security(void *security) { kfree(security); } static int selinux_tun_dev_create(void) { u32 sid = current_sid(); /* we aren't taking into account the "sockcreate" SID since the socket * that is being created here is not a socket in the traditional sense, * instead it is a private sock, accessible only to the kernel, and * representing a wide range of network traffic spanning multiple * connections unlike traditional sockets - check the TUN driver to * get a better understanding of why this socket is special */ return avc_has_perm(&selinux_state, sid, sid, SECCLASS_TUN_SOCKET, TUN_SOCKET__CREATE, NULL); } static int selinux_tun_dev_attach_queue(void *security) { struct tun_security_struct *tunsec = security; return avc_has_perm(&selinux_state, current_sid(), tunsec->sid, SECCLASS_TUN_SOCKET, TUN_SOCKET__ATTACH_QUEUE, NULL); } static int selinux_tun_dev_attach(struct sock *sk, void *security) { struct tun_security_struct *tunsec = security; struct sk_security_struct *sksec = sk->sk_security; /* we don't currently perform any NetLabel based labeling here and it * isn't clear that we would want to do so anyway; while we could apply * labeling without the support of the TUN user the resulting labeled * traffic from the other end of the connection would almost certainly * cause confusion to the TUN user that had no idea network labeling * protocols were being used */ sksec->sid = tunsec->sid; sksec->sclass = SECCLASS_TUN_SOCKET; return 0; } static int selinux_tun_dev_open(void *security) { struct tun_security_struct *tunsec = security; u32 sid = current_sid(); int err; err = avc_has_perm(&selinux_state, sid, tunsec->sid, SECCLASS_TUN_SOCKET, TUN_SOCKET__RELABELFROM, NULL); if (err) return err; err = avc_has_perm(&selinux_state, sid, sid, SECCLASS_TUN_SOCKET, TUN_SOCKET__RELABELTO, NULL); if (err) return err; tunsec->sid = sid; return 0; } #ifdef CONFIG_NETFILTER static unsigned int selinux_ip_forward(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { int ifindex; u16 family; char *addrp; u32 peer_sid; struct common_audit_data ad; struct lsm_network_audit net = {0,}; int secmark_active, peerlbl_active; if (!selinux_policycap_netpeer()) return NF_ACCEPT; secmark_active = selinux_secmark_enabled(); peerlbl_active = selinux_peerlbl_enabled(); if (!secmark_active && !peerlbl_active) return NF_ACCEPT; family = state->pf; if (selinux_skb_peerlbl_sid(skb, family, &peer_sid) != 0) return NF_DROP; ifindex = state->in->ifindex; ad.type = LSM_AUDIT_DATA_NET; ad.u.net = &net; ad.u.net->netif = ifindex; ad.u.net->family = family; if (selinux_parse_skb(skb, &ad, &addrp, 1, NULL) != 0) return NF_DROP; if (peerlbl_active) { int err; err = selinux_inet_sys_rcv_skb(state->net, ifindex, addrp, family, peer_sid, &ad); if (err) { selinux_netlbl_err(skb, family, err, 1); return NF_DROP; } } if (secmark_active) if (avc_has_perm(&selinux_state, peer_sid, skb->secmark, SECCLASS_PACKET, PACKET__FORWARD_IN, &ad)) return NF_DROP; if (netlbl_enabled()) /* we do this in the FORWARD path and not the POST_ROUTING * path because we want to make sure we apply the necessary * labeling before IPsec is applied so we can leverage AH * protection */ if (selinux_netlbl_skbuff_setsid(skb, family, peer_sid) != 0) return NF_DROP; return NF_ACCEPT; } static unsigned int selinux_ip_output(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct sock *sk; u32 sid; if (!netlbl_enabled()) return NF_ACCEPT; /* we do this in the LOCAL_OUT path and not the POST_ROUTING path * because we want to make sure we apply the necessary labeling * before IPsec is applied so we can leverage AH protection */ sk = skb->sk; if (sk) { struct sk_security_struct *sksec; if (sk_listener(sk)) /* if the socket is the listening state then this * packet is a SYN-ACK packet which means it needs to * be labeled based on the connection/request_sock and * not the parent socket. unfortunately, we can't * lookup the request_sock yet as it isn't queued on * the parent socket until after the SYN-ACK is sent. * the "solution" is to simply pass the packet as-is * as any IP option based labeling should be copied * from the initial connection request (in the IP * layer). it is far from ideal, but until we get a * security label in the packet itself this is the * best we can do. */ return NF_ACCEPT; /* standard practice, label using the parent socket */ sksec = sk->sk_security; sid = sksec->sid; } else sid = SECINITSID_KERNEL; if (selinux_netlbl_skbuff_setsid(skb, state->pf, sid) != 0) return NF_DROP; return NF_ACCEPT; } static unsigned int selinux_ip_postroute_compat(struct sk_buff *skb, const struct nf_hook_state *state) { struct sock *sk; struct sk_security_struct *sksec; struct common_audit_data ad; struct lsm_network_audit net = {0,}; u8 proto = 0; sk = skb_to_full_sk(skb); if (sk == NULL) return NF_ACCEPT; sksec = sk->sk_security; ad.type = LSM_AUDIT_DATA_NET; ad.u.net = &net; ad.u.net->netif = state->out->ifindex; ad.u.net->family = state->pf; if (selinux_parse_skb(skb, &ad, NULL, 0, &proto)) return NF_DROP; if (selinux_secmark_enabled()) if (avc_has_perm(&selinux_state, sksec->sid, skb->secmark, SECCLASS_PACKET, PACKET__SEND, &ad)) return NF_DROP_ERR(-ECONNREFUSED); if (selinux_xfrm_postroute_last(sksec->sid, skb, &ad, proto)) return NF_DROP_ERR(-ECONNREFUSED); return NF_ACCEPT; } static unsigned int selinux_ip_postroute(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { u16 family; u32 secmark_perm; u32 peer_sid; int ifindex; struct sock *sk; struct common_audit_data ad; struct lsm_network_audit net = {0,}; char *addrp; int secmark_active, peerlbl_active; /* If any sort of compatibility mode is enabled then handoff processing * to the selinux_ip_postroute_compat() function to deal with the * special handling. We do this in an attempt to keep this function * as fast and as clean as possible. */ if (!selinux_policycap_netpeer()) return selinux_ip_postroute_compat(skb, state); secmark_active = selinux_secmark_enabled(); peerlbl_active = selinux_peerlbl_enabled(); if (!secmark_active && !peerlbl_active) return NF_ACCEPT; sk = skb_to_full_sk(skb); #ifdef CONFIG_XFRM /* If skb->dst->xfrm is non-NULL then the packet is undergoing an IPsec * packet transformation so allow the packet to pass without any checks * since we'll have another chance to perform access control checks * when the packet is on it's final way out. * NOTE: there appear to be some IPv6 multicast cases where skb->dst * is NULL, in this case go ahead and apply access control. * NOTE: if this is a local socket (skb->sk != NULL) that is in the * TCP listening state we cannot wait until the XFRM processing * is done as we will miss out on the SA label if we do; * unfortunately, this means more work, but it is only once per * connection. */ if (skb_dst(skb) != NULL && skb_dst(skb)->xfrm != NULL && !(sk && sk_listener(sk))) return NF_ACCEPT; #endif family = state->pf; if (sk == NULL) { /* Without an associated socket the packet is either coming * from the kernel or it is being forwarded; check the packet * to determine which and if the packet is being forwarded * query the packet directly to determine the security label. */ if (skb->skb_iif) { secmark_perm = PACKET__FORWARD_OUT; if (selinux_skb_peerlbl_sid(skb, family, &peer_sid)) return NF_DROP; } else { secmark_perm = PACKET__SEND; peer_sid = SECINITSID_KERNEL; } } else if (sk_listener(sk)) { /* Locally generated packet but the associated socket is in the * listening state which means this is a SYN-ACK packet. In * this particular case the correct security label is assigned * to the connection/request_sock but unfortunately we can't * query the request_sock as it isn't queued on the parent * socket until after the SYN-ACK packet is sent; the only * viable choice is to regenerate the label like we do in * selinux_inet_conn_request(). See also selinux_ip_output() * for similar problems. */ u32 skb_sid; struct sk_security_struct *sksec; sksec = sk->sk_security; if (selinux_skb_peerlbl_sid(skb, family, &skb_sid)) return NF_DROP; /* At this point, if the returned skb peerlbl is SECSID_NULL * and the packet has been through at least one XFRM * transformation then we must be dealing with the "final" * form of labeled IPsec packet; since we've already applied * all of our access controls on this packet we can safely * pass the packet. */ if (skb_sid == SECSID_NULL) { switch (family) { case PF_INET: if (IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) return NF_ACCEPT; break; case PF_INET6: if (IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) return NF_ACCEPT; break; default: return NF_DROP_ERR(-ECONNREFUSED); } } if (selinux_conn_sid(sksec->sid, skb_sid, &peer_sid)) return NF_DROP; secmark_perm = PACKET__SEND; } else { /* Locally generated packet, fetch the security label from the * associated socket. */ struct sk_security_struct *sksec = sk->sk_security; peer_sid = sksec->sid; secmark_perm = PACKET__SEND; } ifindex = state->out->ifindex; ad.type = LSM_AUDIT_DATA_NET; ad.u.net = &net; ad.u.net->netif = ifindex; ad.u.net->family = family; if (selinux_parse_skb(skb, &ad, &addrp, 0, NULL)) return NF_DROP; if (secmark_active) if (avc_has_perm(&selinux_state, peer_sid, skb->secmark, SECCLASS_PACKET, secmark_perm, &ad)) return NF_DROP_ERR(-ECONNREFUSED); if (peerlbl_active) { u32 if_sid; u32 node_sid; if (sel_netif_sid(state->net, ifindex, &if_sid)) return NF_DROP; if (avc_has_perm(&selinux_state, peer_sid, if_sid, SECCLASS_NETIF, NETIF__EGRESS, &ad)) return NF_DROP_ERR(-ECONNREFUSED); if (sel_netnode_sid(addrp, family, &node_sid)) return NF_DROP; if (avc_has_perm(&selinux_state, peer_sid, node_sid, SECCLASS_NODE, NODE__SENDTO, &ad)) return NF_DROP_ERR(-ECONNREFUSED); } return NF_ACCEPT; } #endif /* CONFIG_NETFILTER */ static int selinux_netlink_send(struct sock *sk, struct sk_buff *skb) { int rc = 0; unsigned int msg_len; unsigned int data_len = skb->len; unsigned char *data = skb->data; struct nlmsghdr *nlh; struct sk_security_struct *sksec = sk->sk_security; u16 sclass = sksec->sclass; u32 perm; while (data_len >= nlmsg_total_size(0)) { nlh = (struct nlmsghdr *)data; /* NOTE: the nlmsg_len field isn't reliably set by some netlink * users which means we can't reject skb's with bogus * length fields; our solution is to follow what * netlink_rcv_skb() does and simply skip processing at * messages with length fields that are clearly junk */ if (nlh->nlmsg_len < NLMSG_HDRLEN || nlh->nlmsg_len > data_len) return 0; rc = selinux_nlmsg_lookup(sclass, nlh->nlmsg_type, &perm); if (rc == 0) { rc = sock_has_perm(sk, perm); if (rc) return rc; } else if (rc == -EINVAL) { /* -EINVAL is a missing msg/perm mapping */ pr_warn_ratelimited("SELinux: unrecognized netlink" " message: protocol=%hu nlmsg_type=%hu sclass=%s" " pid=%d comm=%s\n", sk->sk_protocol, nlh->nlmsg_type, secclass_map[sclass - 1].name, task_pid_nr(current), current->comm); if (enforcing_enabled(&selinux_state) && !security_get_allow_unknown(&selinux_state)) return rc; rc = 0; } else if (rc == -ENOENT) { /* -ENOENT is a missing socket/class mapping, ignore */ rc = 0; } else { return rc; } /* move to the next message after applying netlink padding */ msg_len = NLMSG_ALIGN(nlh->nlmsg_len); if (msg_len >= data_len) return 0; data_len -= msg_len; data += msg_len; } return rc; } static void ipc_init_security(struct ipc_security_struct *isec, u16 sclass) { isec->sclass = sclass; isec->sid = current_sid(); } static int ipc_has_perm(struct kern_ipc_perm *ipc_perms, u32 perms) { struct ipc_security_struct *isec; struct common_audit_data ad; u32 sid = current_sid(); isec = selinux_ipc(ipc_perms); ad.type = LSM_AUDIT_DATA_IPC; ad.u.ipc_id = ipc_perms->key; return avc_has_perm(&selinux_state, sid, isec->sid, isec->sclass, perms, &ad); } static int selinux_msg_msg_alloc_security(struct msg_msg *msg) { struct msg_security_struct *msec; msec = selinux_msg_msg(msg); msec->sid = SECINITSID_UNLABELED; return 0; } /* message queue security operations */ static int selinux_msg_queue_alloc_security(struct kern_ipc_perm *msq) { struct ipc_security_struct *isec; struct common_audit_data ad; u32 sid = current_sid(); isec = selinux_ipc(msq); ipc_init_security(isec, SECCLASS_MSGQ); ad.type = LSM_AUDIT_DATA_IPC; ad.u.ipc_id = msq->key; return avc_has_perm(&selinux_state, sid, isec->sid, SECCLASS_MSGQ, MSGQ__CREATE, &ad); } static int selinux_msg_queue_associate(struct kern_ipc_perm *msq, int msqflg) { struct ipc_security_struct *isec; struct common_audit_data ad; u32 sid = current_sid(); isec = selinux_ipc(msq); ad.type = LSM_AUDIT_DATA_IPC; ad.u.ipc_id = msq->key; return avc_has_perm(&selinux_state, sid, isec->sid, SECCLASS_MSGQ, MSGQ__ASSOCIATE, &ad); } static int selinux_msg_queue_msgctl(struct kern_ipc_perm *msq, int cmd) { int err; int perms; switch (cmd) { case IPC_INFO: case MSG_INFO: /* No specific object, just general system-wide information. */ return avc_has_perm(&selinux_state, current_sid(), SECINITSID_KERNEL, SECCLASS_SYSTEM, SYSTEM__IPC_INFO, NULL); case IPC_STAT: case MSG_STAT: case MSG_STAT_ANY: perms = MSGQ__GETATTR | MSGQ__ASSOCIATE; break; case IPC_SET: perms = MSGQ__SETATTR; break; case IPC_RMID: perms = MSGQ__DESTROY; break; default: return 0; } err = ipc_has_perm(msq, perms); return err; } static int selinux_msg_queue_msgsnd(struct kern_ipc_perm *msq, struct msg_msg *msg, int msqflg) { struct ipc_security_struct *isec; struct msg_security_struct *msec; struct common_audit_data ad; u32 sid = current_sid(); int rc; isec = selinux_ipc(msq); msec = selinux_msg_msg(msg); /* * First time through, need to assign label to the message */ if (msec->sid == SECINITSID_UNLABELED) { /* * Compute new sid based on current process and * message queue this message will be stored in */ rc = security_transition_sid(&selinux_state, sid, isec->sid, SECCLASS_MSG, NULL, &msec->sid); if (rc) return rc; } ad.type = LSM_AUDIT_DATA_IPC; ad.u.ipc_id = msq->key; /* Can this process write to the queue? */ rc = avc_has_perm(&selinux_state, sid, isec->sid, SECCLASS_MSGQ, MSGQ__WRITE, &ad); if (!rc) /* Can this process send the message */ rc = avc_has_perm(&selinux_state, sid, msec->sid, SECCLASS_MSG, MSG__SEND, &ad); if (!rc) /* Can the message be put in the queue? */ rc = avc_has_perm(&selinux_state, msec->sid, isec->sid, SECCLASS_MSGQ, MSGQ__ENQUEUE, &ad); return rc; } static int selinux_msg_queue_msgrcv(struct kern_ipc_perm *msq, struct msg_msg *msg, struct task_struct *target, long type, int mode) { struct ipc_security_struct *isec; struct msg_security_struct *msec; struct common_audit_data ad; u32 sid = task_sid_obj(target); int rc; isec = selinux_ipc(msq); msec = selinux_msg_msg(msg); ad.type = LSM_AUDIT_DATA_IPC; ad.u.ipc_id = msq->key; rc = avc_has_perm(&selinux_state, sid, isec->sid, SECCLASS_MSGQ, MSGQ__READ, &ad); if (!rc) rc = avc_has_perm(&selinux_state, sid, msec->sid, SECCLASS_MSG, MSG__RECEIVE, &ad); return rc; } /* Shared Memory security operations */ static int selinux_shm_alloc_security(struct kern_ipc_perm *shp) { struct ipc_security_struct *isec; struct common_audit_data ad; u32 sid = current_sid(); isec = selinux_ipc(shp); ipc_init_security(isec, SECCLASS_SHM); ad.type = LSM_AUDIT_DATA_IPC; ad.u.ipc_id = shp->key; return avc_has_perm(&selinux_state, sid, isec->sid, SECCLASS_SHM, SHM__CREATE, &ad); } static int selinux_shm_associate(struct kern_ipc_perm *shp, int shmflg) { struct ipc_security_struct *isec; struct common_audit_data ad; u32 sid = current_sid(); isec = selinux_ipc(shp); ad.type = LSM_AUDIT_DATA_IPC; ad.u.ipc_id = shp->key; return avc_has_perm(&selinux_state, sid, isec->sid, SECCLASS_SHM, SHM__ASSOCIATE, &ad); } /* Note, at this point, shp is locked down */ static int selinux_shm_shmctl(struct kern_ipc_perm *shp, int cmd) { int perms; int err; switch (cmd) { case IPC_INFO: case SHM_INFO: /* No specific object, just general system-wide information. */ return avc_has_perm(&selinux_state, current_sid(), SECINITSID_KERNEL, SECCLASS_SYSTEM, SYSTEM__IPC_INFO, NULL); case IPC_STAT: case SHM_STAT: case SHM_STAT_ANY: perms = SHM__GETATTR | SHM__ASSOCIATE; break; case IPC_SET: perms = SHM__SETATTR; break; case SHM_LOCK: case SHM_UNLOCK: perms = SHM__LOCK; break; case IPC_RMID: perms = SHM__DESTROY; break; default: return 0; } err = ipc_has_perm(shp, perms); return err; } static int selinux_shm_shmat(struct kern_ipc_perm *shp, char __user *shmaddr, int shmflg) { u32 perms; if (shmflg & SHM_RDONLY) perms = SHM__READ; else perms = SHM__READ | SHM__WRITE; return ipc_has_perm(shp, perms); } /* Semaphore security operations */ static int selinux_sem_alloc_security(struct kern_ipc_perm *sma) { struct ipc_security_struct *isec; struct common_audit_data ad; u32 sid = current_sid(); isec = selinux_ipc(sma); ipc_init_security(isec, SECCLASS_SEM); ad.type = LSM_AUDIT_DATA_IPC; ad.u.ipc_id = sma->key; return avc_has_perm(&selinux_state, sid, isec->sid, SECCLASS_SEM, SEM__CREATE, &ad); } static int selinux_sem_associate(struct kern_ipc_perm *sma, int semflg) { struct ipc_security_struct *isec; struct common_audit_data ad; u32 sid = current_sid(); isec = selinux_ipc(sma); ad.type = LSM_AUDIT_DATA_IPC; ad.u.ipc_id = sma->key; return avc_has_perm(&selinux_state, sid, isec->sid, SECCLASS_SEM, SEM__ASSOCIATE, &ad); } /* Note, at this point, sma is locked down */ static int selinux_sem_semctl(struct kern_ipc_perm *sma, int cmd) { int err; u32 perms; switch (cmd) { case IPC_INFO: case SEM_INFO: /* No specific object, just general system-wide information. */ return avc_has_perm(&selinux_state, current_sid(), SECINITSID_KERNEL, SECCLASS_SYSTEM, SYSTEM__IPC_INFO, NULL); case GETPID: case GETNCNT: case GETZCNT: perms = SEM__GETATTR; break; case GETVAL: case GETALL: perms = SEM__READ; break; case SETVAL: case SETALL: perms = SEM__WRITE; break; case IPC_RMID: perms = SEM__DESTROY; break; case IPC_SET: perms = SEM__SETATTR; break; case IPC_STAT: case SEM_STAT: case SEM_STAT_ANY: perms = SEM__GETATTR | SEM__ASSOCIATE; break; default: return 0; } err = ipc_has_perm(sma, perms); return err; } static int selinux_sem_semop(struct kern_ipc_perm *sma, struct sembuf *sops, unsigned nsops, int alter) { u32 perms; if (alter) perms = SEM__READ | SEM__WRITE; else perms = SEM__READ; return ipc_has_perm(sma, perms); } static int selinux_ipc_permission(struct kern_ipc_perm *ipcp, short flag) { u32 av = 0; av = 0; if (flag & S_IRUGO) av |= IPC__UNIX_READ; if (flag & S_IWUGO) av |= IPC__UNIX_WRITE; if (av == 0) return 0; return ipc_has_perm(ipcp, av); } static void selinux_ipc_getsecid(struct kern_ipc_perm *ipcp, u32 *secid) { struct ipc_security_struct *isec = selinux_ipc(ipcp); *secid = isec->sid; } static void selinux_d_instantiate(struct dentry *dentry, struct inode *inode) { if (inode) inode_doinit_with_dentry(inode, dentry); } static int selinux_getprocattr(struct task_struct *p, const char *name, char **value) { const struct task_security_struct *__tsec; u32 sid; int error; unsigned len; rcu_read_lock(); __tsec = selinux_cred(__task_cred(p)); if (current != p) { error = avc_has_perm(&selinux_state, current_sid(), __tsec->sid, SECCLASS_PROCESS, PROCESS__GETATTR, NULL); if (error) goto bad; } if (!strcmp(name, "current")) sid = __tsec->sid; else if (!strcmp(name, "prev")) sid = __tsec->osid; else if (!strcmp(name, "exec")) sid = __tsec->exec_sid; else if (!strcmp(name, "fscreate")) sid = __tsec->create_sid; else if (!strcmp(name, "keycreate")) sid = __tsec->keycreate_sid; else if (!strcmp(name, "sockcreate")) sid = __tsec->sockcreate_sid; else { error = -EINVAL; goto bad; } rcu_read_unlock(); if (!sid) return 0; error = security_sid_to_context(&selinux_state, sid, value, &len); if (error) return error; return len; bad: rcu_read_unlock(); return error; } static int selinux_setprocattr(const char *name, void *value, size_t size) { struct task_security_struct *tsec; struct cred *new; u32 mysid = current_sid(), sid = 0, ptsid; int error; char *str = value; /* * Basic control over ability to set these attributes at all. */ if (!strcmp(name, "exec")) error = avc_has_perm(&selinux_state, mysid, mysid, SECCLASS_PROCESS, PROCESS__SETEXEC, NULL); else if (!strcmp(name, "fscreate")) error = avc_has_perm(&selinux_state, mysid, mysid, SECCLASS_PROCESS, PROCESS__SETFSCREATE, NULL); else if (!strcmp(name, "keycreate")) error = avc_has_perm(&selinux_state, mysid, mysid, SECCLASS_PROCESS, PROCESS__SETKEYCREATE, NULL); else if (!strcmp(name, "sockcreate")) error = avc_has_perm(&selinux_state, mysid, mysid, SECCLASS_PROCESS, PROCESS__SETSOCKCREATE, NULL); else if (!strcmp(name, "current")) error = avc_has_perm(&selinux_state, mysid, mysid, SECCLASS_PROCESS, PROCESS__SETCURRENT, NULL); else error = -EINVAL; if (error) return error; /* Obtain a SID for the context, if one was specified. */ if (size && str[0] && str[0] != '\n') { if (str[size-1] == '\n') { str[size-1] = 0; size--; } error = security_context_to_sid(&selinux_state, value, size, &sid, GFP_KERNEL); if (error == -EINVAL && !strcmp(name, "fscreate")) { if (!has_cap_mac_admin(true)) { struct audit_buffer *ab; size_t audit_size; /* We strip a nul only if it is at the end, otherwise the * context contains a nul and we should audit that */ if (str[size - 1] == '\0') audit_size = size - 1; else audit_size = size; ab = audit_log_start(audit_context(), GFP_ATOMIC, AUDIT_SELINUX_ERR); if (!ab) return error; audit_log_format(ab, "op=fscreate invalid_context="); audit_log_n_untrustedstring(ab, value, audit_size); audit_log_end(ab); return error; } error = security_context_to_sid_force( &selinux_state, value, size, &sid); } if (error) return error; } new = prepare_creds(); if (!new) return -ENOMEM; /* Permission checking based on the specified context is performed during the actual operation (execve, open/mkdir/...), when we know the full context of the operation. See selinux_bprm_creds_for_exec for the execve checks and may_create for the file creation checks. The operation will then fail if the context is not permitted. */ tsec = selinux_cred(new); if (!strcmp(name, "exec")) { tsec->exec_sid = sid; } else if (!strcmp(name, "fscreate")) { tsec->create_sid = sid; } else if (!strcmp(name, "keycreate")) { if (sid) { error = avc_has_perm(&selinux_state, mysid, sid, SECCLASS_KEY, KEY__CREATE, NULL); if (error) goto abort_change; } tsec->keycreate_sid = sid; } else if (!strcmp(name, "sockcreate")) { tsec->sockcreate_sid = sid; } else if (!strcmp(name, "current")) { error = -EINVAL; if (sid == 0) goto abort_change; /* Only allow single threaded processes to change context */ if (!current_is_single_threaded()) { error = security_bounded_transition(&selinux_state, tsec->sid, sid); if (error) goto abort_change; } /* Check permissions for the transition. */ error = avc_has_perm(&selinux_state, tsec->sid, sid, SECCLASS_PROCESS, PROCESS__DYNTRANSITION, NULL); if (error) goto abort_change; /* Check for ptracing, and update the task SID if ok. Otherwise, leave SID unchanged and fail. */ ptsid = ptrace_parent_sid(); if (ptsid != 0) { error = avc_has_perm(&selinux_state, ptsid, sid, SECCLASS_PROCESS, PROCESS__PTRACE, NULL); if (error) goto abort_change; } tsec->sid = sid; } else { error = -EINVAL; goto abort_change; } commit_creds(new); return size; abort_change: abort_creds(new); return error; } static int selinux_ismaclabel(const char *name) { return (strcmp(name, XATTR_SELINUX_SUFFIX) == 0); } static int selinux_secid_to_secctx(u32 secid, char **secdata, u32 *seclen) { return security_sid_to_context(&selinux_state, secid, secdata, seclen); } static int selinux_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid) { return security_context_to_sid(&selinux_state, secdata, seclen, secid, GFP_KERNEL); } static void selinux_release_secctx(char *secdata, u32 seclen) { kfree(secdata); } static void selinux_inode_invalidate_secctx(struct inode *inode) { struct inode_security_struct *isec = selinux_inode(inode); spin_lock(&isec->lock); isec->initialized = LABEL_INVALID; spin_unlock(&isec->lock); } /* * called with inode->i_mutex locked */ static int selinux_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen) { int rc = selinux_inode_setsecurity(inode, XATTR_SELINUX_SUFFIX, ctx, ctxlen, 0); /* Do not return error when suppressing label (SBLABEL_MNT not set). */ return rc == -EOPNOTSUPP ? 0 : rc; } /* * called with inode->i_mutex locked */ static int selinux_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen) { return __vfs_setxattr_locked(&init_user_ns, dentry, XATTR_NAME_SELINUX, ctx, ctxlen, 0, NULL); } static int selinux_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen) { int len = 0; len = selinux_inode_getsecurity(&init_user_ns, inode, XATTR_SELINUX_SUFFIX, ctx, true); if (len < 0) return len; *ctxlen = len; return 0; } #ifdef CONFIG_KEYS static int selinux_key_alloc(struct key *k, const struct cred *cred, unsigned long flags) { const struct task_security_struct *tsec; struct key_security_struct *ksec; ksec = kzalloc(sizeof(struct key_security_struct), GFP_KERNEL); if (!ksec) return -ENOMEM; tsec = selinux_cred(cred); if (tsec->keycreate_sid) ksec->sid = tsec->keycreate_sid; else ksec->sid = tsec->sid; k->security = ksec; return 0; } static void selinux_key_free(struct key *k) { struct key_security_struct *ksec = k->security; k->security = NULL; kfree(ksec); } static int selinux_key_permission(key_ref_t key_ref, const struct cred *cred, enum key_need_perm need_perm) { struct key *key; struct key_security_struct *ksec; u32 perm, sid; switch (need_perm) { case KEY_NEED_VIEW: perm = KEY__VIEW; break; case KEY_NEED_READ: perm = KEY__READ; break; case KEY_NEED_WRITE: perm = KEY__WRITE; break; case KEY_NEED_SEARCH: perm = KEY__SEARCH; break; case KEY_NEED_LINK: perm = KEY__LINK; break; case KEY_NEED_SETATTR: perm = KEY__SETATTR; break; case KEY_NEED_UNLINK: case KEY_SYSADMIN_OVERRIDE: case KEY_AUTHTOKEN_OVERRIDE: case KEY_DEFER_PERM_CHECK: return 0; default: WARN_ON(1); return -EPERM; } sid = cred_sid(cred); key = key_ref_to_ptr(key_ref); ksec = key->security; return avc_has_perm(&selinux_state, sid, ksec->sid, SECCLASS_KEY, perm, NULL); } static int selinux_key_getsecurity(struct key *key, char **_buffer) { struct key_security_struct *ksec = key->security; char *context = NULL; unsigned len; int rc; rc = security_sid_to_context(&selinux_state, ksec->sid, &context, &len); if (!rc) rc = len; *_buffer = context; return rc; } #ifdef CONFIG_KEY_NOTIFICATIONS static int selinux_watch_key(struct key *key) { struct key_security_struct *ksec = key->security; u32 sid = current_sid(); return avc_has_perm(&selinux_state, sid, ksec->sid, SECCLASS_KEY, KEY__VIEW, NULL); } #endif #endif #ifdef CONFIG_SECURITY_INFINIBAND static int selinux_ib_pkey_access(void *ib_sec, u64 subnet_prefix, u16 pkey_val) { struct common_audit_data ad; int err; u32 sid = 0; struct ib_security_struct *sec = ib_sec; struct lsm_ibpkey_audit ibpkey; err = sel_ib_pkey_sid(subnet_prefix, pkey_val, &sid); if (err) return err; ad.type = LSM_AUDIT_DATA_IBPKEY; ibpkey.subnet_prefix = subnet_prefix; ibpkey.pkey = pkey_val; ad.u.ibpkey = &ibpkey; return avc_has_perm(&selinux_state, sec->sid, sid, SECCLASS_INFINIBAND_PKEY, INFINIBAND_PKEY__ACCESS, &ad); } static int selinux_ib_endport_manage_subnet(void *ib_sec, const char *dev_name, u8 port_num) { struct common_audit_data ad; int err; u32 sid = 0; struct ib_security_struct *sec = ib_sec; struct lsm_ibendport_audit ibendport; err = security_ib_endport_sid(&selinux_state, dev_name, port_num, &sid); if (err) return err; ad.type = LSM_AUDIT_DATA_IBENDPORT; ibendport.dev_name = dev_name; ibendport.port = port_num; ad.u.ibendport = &ibendport; return avc_has_perm(&selinux_state, sec->sid, sid, SECCLASS_INFINIBAND_ENDPORT, INFINIBAND_ENDPORT__MANAGE_SUBNET, &ad); } static int selinux_ib_alloc_security(void **ib_sec) { struct ib_security_struct *sec; sec = kzalloc(sizeof(*sec), GFP_KERNEL); if (!sec) return -ENOMEM; sec->sid = current_sid(); *ib_sec = sec; return 0; } static void selinux_ib_free_security(void *ib_sec) { kfree(ib_sec); } #endif #ifdef CONFIG_BPF_SYSCALL static int selinux_bpf(int cmd, union bpf_attr *attr, unsigned int size) { u32 sid = current_sid(); int ret; switch (cmd) { case BPF_MAP_CREATE: ret = avc_has_perm(&selinux_state, sid, sid, SECCLASS_BPF, BPF__MAP_CREATE, NULL); break; case BPF_PROG_LOAD: ret = avc_has_perm(&selinux_state, sid, sid, SECCLASS_BPF, BPF__PROG_LOAD, NULL); break; default: ret = 0; break; } return ret; } static u32 bpf_map_fmode_to_av(fmode_t fmode) { u32 av = 0; if (fmode & FMODE_READ) av |= BPF__MAP_READ; if (fmode & FMODE_WRITE) av |= BPF__MAP_WRITE; return av; } /* This function will check the file pass through unix socket or binder to see * if it is a bpf related object. And apply corresponding checks on the bpf * object based on the type. The bpf maps and programs, not like other files and * socket, are using a shared anonymous inode inside the kernel as their inode. * So checking that inode cannot identify if the process have privilege to * access the bpf object and that's why we have to add this additional check in * selinux_file_receive and selinux_binder_transfer_files. */ static int bpf_fd_pass(struct file *file, u32 sid) { struct bpf_security_struct *bpfsec; struct bpf_prog *prog; struct bpf_map *map; int ret; if (file->f_op == &bpf_map_fops) { map = file->private_data; bpfsec = map->security; ret = avc_has_perm(&selinux_state, sid, bpfsec->sid, SECCLASS_BPF, bpf_map_fmode_to_av(file->f_mode), NULL); if (ret) return ret; } else if (file->f_op == &bpf_prog_fops) { prog = file->private_data; bpfsec = prog->aux->security; ret = avc_has_perm(&selinux_state, sid, bpfsec->sid, SECCLASS_BPF, BPF__PROG_RUN, NULL); if (ret) return ret; } return 0; } static int selinux_bpf_map(struct bpf_map *map, fmode_t fmode) { u32 sid = current_sid(); struct bpf_security_struct *bpfsec; bpfsec = map->security; return avc_has_perm(&selinux_state, sid, bpfsec->sid, SECCLASS_BPF, bpf_map_fmode_to_av(fmode), NULL); } static int selinux_bpf_prog(struct bpf_prog *prog) { u32 sid = current_sid(); struct bpf_security_struct *bpfsec; bpfsec = prog->aux->security; return avc_has_perm(&selinux_state, sid, bpfsec->sid, SECCLASS_BPF, BPF__PROG_RUN, NULL); } static int selinux_bpf_map_alloc(struct bpf_map *map) { struct bpf_security_struct *bpfsec; bpfsec = kzalloc(sizeof(*bpfsec), GFP_KERNEL); if (!bpfsec) return -ENOMEM; bpfsec->sid = current_sid(); map->security = bpfsec; return 0; } static void selinux_bpf_map_free(struct bpf_map *map) { struct bpf_security_struct *bpfsec = map->security; map->security = NULL; kfree(bpfsec); } static int selinux_bpf_prog_alloc(struct bpf_prog_aux *aux) { struct bpf_security_struct *bpfsec; bpfsec = kzalloc(sizeof(*bpfsec), GFP_KERNEL); if (!bpfsec) return -ENOMEM; bpfsec->sid = current_sid(); aux->security = bpfsec; return 0; } static void selinux_bpf_prog_free(struct bpf_prog_aux *aux) { struct bpf_security_struct *bpfsec = aux->security; aux->security = NULL; kfree(bpfsec); } #endif struct lsm_blob_sizes selinux_blob_sizes __lsm_ro_after_init = { .lbs_cred = sizeof(struct task_security_struct), .lbs_file = sizeof(struct file_security_struct), .lbs_inode = sizeof(struct inode_security_struct), .lbs_ipc = sizeof(struct ipc_security_struct), .lbs_msg_msg = sizeof(struct msg_security_struct), .lbs_superblock = sizeof(struct superblock_security_struct), }; #ifdef CONFIG_PERF_EVENTS static int selinux_perf_event_open(struct perf_event_attr *attr, int type) { u32 requested, sid = current_sid(); if (type == PERF_SECURITY_OPEN) requested = PERF_EVENT__OPEN; else if (type == PERF_SECURITY_CPU) requested = PERF_EVENT__CPU; else if (type == PERF_SECURITY_KERNEL) requested = PERF_EVENT__KERNEL; else if (type == PERF_SECURITY_TRACEPOINT) requested = PERF_EVENT__TRACEPOINT; else return -EINVAL; return avc_has_perm(&selinux_state, sid, sid, SECCLASS_PERF_EVENT, requested, NULL); } static int selinux_perf_event_alloc(struct perf_event *event) { struct perf_event_security_struct *perfsec; perfsec = kzalloc(sizeof(*perfsec), GFP_KERNEL); if (!perfsec) return -ENOMEM; perfsec->sid = current_sid(); event->security = perfsec; return 0; } static void selinux_perf_event_free(struct perf_event *event) { struct perf_event_security_struct *perfsec = event->security; event->security = NULL; kfree(perfsec); } static int selinux_perf_event_read(struct perf_event *event) { struct perf_event_security_struct *perfsec = event->security; u32 sid = current_sid(); return avc_has_perm(&selinux_state, sid, perfsec->sid, SECCLASS_PERF_EVENT, PERF_EVENT__READ, NULL); } static int selinux_perf_event_write(struct perf_event *event) { struct perf_event_security_struct *perfsec = event->security; u32 sid = current_sid(); return avc_has_perm(&selinux_state, sid, perfsec->sid, SECCLASS_PERF_EVENT, PERF_EVENT__WRITE, NULL); } #endif #ifdef CONFIG_IO_URING /** * selinux_uring_override_creds - check the requested cred override * @new: the target creds * * Check to see if the current task is allowed to override it's credentials * to service an io_uring operation. */ static int selinux_uring_override_creds(const struct cred *new) { return avc_has_perm(&selinux_state, current_sid(), cred_sid(new), SECCLASS_IO_URING, IO_URING__OVERRIDE_CREDS, NULL); } /** * selinux_uring_sqpoll - check if a io_uring polling thread can be created * * Check to see if the current task is allowed to create a new io_uring * kernel polling thread. */ static int selinux_uring_sqpoll(void) { int sid = current_sid(); return avc_has_perm(&selinux_state, sid, sid, SECCLASS_IO_URING, IO_URING__SQPOLL, NULL); } /** * selinux_uring_cmd - check if IORING_OP_URING_CMD is allowed * @ioucmd: the io_uring command structure * * Check to see if the current domain is allowed to execute an * IORING_OP_URING_CMD against the device/file specified in @ioucmd. * */ static int selinux_uring_cmd(struct io_uring_cmd *ioucmd) { struct file *file = ioucmd->file; struct inode *inode = file_inode(file); struct inode_security_struct *isec = selinux_inode(inode); struct common_audit_data ad; ad.type = LSM_AUDIT_DATA_FILE; ad.u.file = file; return avc_has_perm(&selinux_state, current_sid(), isec->sid, SECCLASS_IO_URING, IO_URING__CMD, &ad); } #endif /* CONFIG_IO_URING */ /* * IMPORTANT NOTE: When adding new hooks, please be careful to keep this order: * 1. any hooks that don't belong to (2.) or (3.) below, * 2. hooks that both access structures allocated by other hooks, and allocate * structures that can be later accessed by other hooks (mostly "cloning" * hooks), * 3. hooks that only allocate structures that can be later accessed by other * hooks ("allocating" hooks). * * Please follow block comment delimiters in the list to keep this order. * * This ordering is needed for SELinux runtime disable to work at least somewhat * safely. Breaking the ordering rules above might lead to NULL pointer derefs * when disabling SELinux at runtime. */ static struct security_hook_list selinux_hooks[] __lsm_ro_after_init = { LSM_HOOK_INIT(binder_set_context_mgr, selinux_binder_set_context_mgr), LSM_HOOK_INIT(binder_transaction, selinux_binder_transaction), LSM_HOOK_INIT(binder_transfer_binder, selinux_binder_transfer_binder), LSM_HOOK_INIT(binder_transfer_file, selinux_binder_transfer_file), LSM_HOOK_INIT(ptrace_access_check, selinux_ptrace_access_check), LSM_HOOK_INIT(ptrace_traceme, selinux_ptrace_traceme), LSM_HOOK_INIT(capget, selinux_capget), LSM_HOOK_INIT(capset, selinux_capset), LSM_HOOK_INIT(capable, selinux_capable), LSM_HOOK_INIT(quotactl, selinux_quotactl), LSM_HOOK_INIT(quota_on, selinux_quota_on), LSM_HOOK_INIT(syslog, selinux_syslog), LSM_HOOK_INIT(vm_enough_memory, selinux_vm_enough_memory), LSM_HOOK_INIT(netlink_send, selinux_netlink_send), LSM_HOOK_INIT(bprm_creds_for_exec, selinux_bprm_creds_for_exec), LSM_HOOK_INIT(bprm_committing_creds, selinux_bprm_committing_creds), LSM_HOOK_INIT(bprm_committed_creds, selinux_bprm_committed_creds), LSM_HOOK_INIT(sb_free_mnt_opts, selinux_free_mnt_opts), LSM_HOOK_INIT(sb_mnt_opts_compat, selinux_sb_mnt_opts_compat), LSM_HOOK_INIT(sb_remount, selinux_sb_remount), LSM_HOOK_INIT(sb_kern_mount, selinux_sb_kern_mount), LSM_HOOK_INIT(sb_show_options, selinux_sb_show_options), LSM_HOOK_INIT(sb_statfs, selinux_sb_statfs), LSM_HOOK_INIT(sb_mount, selinux_mount), LSM_HOOK_INIT(sb_umount, selinux_umount), LSM_HOOK_INIT(sb_set_mnt_opts, selinux_set_mnt_opts), LSM_HOOK_INIT(sb_clone_mnt_opts, selinux_sb_clone_mnt_opts), LSM_HOOK_INIT(move_mount, selinux_move_mount), LSM_HOOK_INIT(dentry_init_security, selinux_dentry_init_security), LSM_HOOK_INIT(dentry_create_files_as, selinux_dentry_create_files_as), LSM_HOOK_INIT(inode_free_security, selinux_inode_free_security), LSM_HOOK_INIT(inode_init_security, selinux_inode_init_security), LSM_HOOK_INIT(inode_init_security_anon, selinux_inode_init_security_anon), LSM_HOOK_INIT(inode_create, selinux_inode_create), LSM_HOOK_INIT(inode_link, selinux_inode_link), LSM_HOOK_INIT(inode_unlink, selinux_inode_unlink), LSM_HOOK_INIT(inode_symlink, selinux_inode_symlink), LSM_HOOK_INIT(inode_mkdir, selinux_inode_mkdir), LSM_HOOK_INIT(inode_rmdir, selinux_inode_rmdir), LSM_HOOK_INIT(inode_mknod, selinux_inode_mknod), LSM_HOOK_INIT(inode_rename, selinux_inode_rename), LSM_HOOK_INIT(inode_readlink, selinux_inode_readlink), LSM_HOOK_INIT(inode_follow_link, selinux_inode_follow_link), LSM_HOOK_INIT(inode_permission, selinux_inode_permission), LSM_HOOK_INIT(inode_setattr, selinux_inode_setattr), LSM_HOOK_INIT(inode_getattr, selinux_inode_getattr), LSM_HOOK_INIT(inode_setxattr, selinux_inode_setxattr), LSM_HOOK_INIT(inode_post_setxattr, selinux_inode_post_setxattr), LSM_HOOK_INIT(inode_getxattr, selinux_inode_getxattr), LSM_HOOK_INIT(inode_listxattr, selinux_inode_listxattr), LSM_HOOK_INIT(inode_removexattr, selinux_inode_removexattr), LSM_HOOK_INIT(inode_getsecurity, selinux_inode_getsecurity), LSM_HOOK_INIT(inode_setsecurity, selinux_inode_setsecurity), LSM_HOOK_INIT(inode_listsecurity, selinux_inode_listsecurity), LSM_HOOK_INIT(inode_getsecid, selinux_inode_getsecid), LSM_HOOK_INIT(inode_copy_up, selinux_inode_copy_up), LSM_HOOK_INIT(inode_copy_up_xattr, selinux_inode_copy_up_xattr), LSM_HOOK_INIT(path_notify, selinux_path_notify), LSM_HOOK_INIT(kernfs_init_security, selinux_kernfs_init_security), LSM_HOOK_INIT(file_permission, selinux_file_permission), LSM_HOOK_INIT(file_alloc_security, selinux_file_alloc_security), LSM_HOOK_INIT(file_ioctl, selinux_file_ioctl), LSM_HOOK_INIT(file_ioctl_compat, selinux_file_ioctl_compat), LSM_HOOK_INIT(mmap_file, selinux_mmap_file), LSM_HOOK_INIT(mmap_addr, selinux_mmap_addr), LSM_HOOK_INIT(file_mprotect, selinux_file_mprotect), LSM_HOOK_INIT(file_lock, selinux_file_lock), LSM_HOOK_INIT(file_fcntl, selinux_file_fcntl), LSM_HOOK_INIT(file_set_fowner, selinux_file_set_fowner), LSM_HOOK_INIT(file_send_sigiotask, selinux_file_send_sigiotask), LSM_HOOK_INIT(file_receive, selinux_file_receive), LSM_HOOK_INIT(file_open, selinux_file_open), LSM_HOOK_INIT(task_alloc, selinux_task_alloc), LSM_HOOK_INIT(cred_prepare, selinux_cred_prepare), LSM_HOOK_INIT(cred_transfer, selinux_cred_transfer), LSM_HOOK_INIT(cred_getsecid, selinux_cred_getsecid), LSM_HOOK_INIT(kernel_act_as, selinux_kernel_act_as), LSM_HOOK_INIT(kernel_create_files_as, selinux_kernel_create_files_as), LSM_HOOK_INIT(kernel_module_request, selinux_kernel_module_request), LSM_HOOK_INIT(kernel_load_data, selinux_kernel_load_data), LSM_HOOK_INIT(kernel_read_file, selinux_kernel_read_file), LSM_HOOK_INIT(task_setpgid, selinux_task_setpgid), LSM_HOOK_INIT(task_getpgid, selinux_task_getpgid), LSM_HOOK_INIT(task_getsid, selinux_task_getsid), LSM_HOOK_INIT(current_getsecid_subj, selinux_current_getsecid_subj), LSM_HOOK_INIT(task_getsecid_obj, selinux_task_getsecid_obj), LSM_HOOK_INIT(task_setnice, selinux_task_setnice), LSM_HOOK_INIT(task_setioprio, selinux_task_setioprio), LSM_HOOK_INIT(task_getioprio, selinux_task_getioprio), LSM_HOOK_INIT(task_prlimit, selinux_task_prlimit), LSM_HOOK_INIT(task_setrlimit, selinux_task_setrlimit), LSM_HOOK_INIT(task_setscheduler, selinux_task_setscheduler), LSM_HOOK_INIT(task_getscheduler, selinux_task_getscheduler), LSM_HOOK_INIT(task_movememory, selinux_task_movememory), LSM_HOOK_INIT(task_kill, selinux_task_kill), LSM_HOOK_INIT(task_to_inode, selinux_task_to_inode), LSM_HOOK_INIT(userns_create, selinux_userns_create), LSM_HOOK_INIT(ipc_permission, selinux_ipc_permission), LSM_HOOK_INIT(ipc_getsecid, selinux_ipc_getsecid), LSM_HOOK_INIT(msg_queue_associate, selinux_msg_queue_associate), LSM_HOOK_INIT(msg_queue_msgctl, selinux_msg_queue_msgctl), LSM_HOOK_INIT(msg_queue_msgsnd, selinux_msg_queue_msgsnd), LSM_HOOK_INIT(msg_queue_msgrcv, selinux_msg_queue_msgrcv), LSM_HOOK_INIT(shm_associate, selinux_shm_associate), LSM_HOOK_INIT(shm_shmctl, selinux_shm_shmctl), LSM_HOOK_INIT(shm_shmat, selinux_shm_shmat), LSM_HOOK_INIT(sem_associate, selinux_sem_associate), LSM_HOOK_INIT(sem_semctl, selinux_sem_semctl), LSM_HOOK_INIT(sem_semop, selinux_sem_semop), LSM_HOOK_INIT(d_instantiate, selinux_d_instantiate), LSM_HOOK_INIT(getprocattr, selinux_getprocattr), LSM_HOOK_INIT(setprocattr, selinux_setprocattr), LSM_HOOK_INIT(ismaclabel, selinux_ismaclabel), LSM_HOOK_INIT(secctx_to_secid, selinux_secctx_to_secid), LSM_HOOK_INIT(release_secctx, selinux_release_secctx), LSM_HOOK_INIT(inode_invalidate_secctx, selinux_inode_invalidate_secctx), LSM_HOOK_INIT(inode_notifysecctx, selinux_inode_notifysecctx), LSM_HOOK_INIT(inode_setsecctx, selinux_inode_setsecctx), LSM_HOOK_INIT(unix_stream_connect, selinux_socket_unix_stream_connect), LSM_HOOK_INIT(unix_may_send, selinux_socket_unix_may_send), LSM_HOOK_INIT(socket_create, selinux_socket_create), LSM_HOOK_INIT(socket_post_create, selinux_socket_post_create), LSM_HOOK_INIT(socket_socketpair, selinux_socket_socketpair), LSM_HOOK_INIT(socket_bind, selinux_socket_bind), LSM_HOOK_INIT(socket_connect, selinux_socket_connect), LSM_HOOK_INIT(socket_listen, selinux_socket_listen), LSM_HOOK_INIT(socket_accept, selinux_socket_accept), LSM_HOOK_INIT(socket_sendmsg, selinux_socket_sendmsg), LSM_HOOK_INIT(socket_recvmsg, selinux_socket_recvmsg), LSM_HOOK_INIT(socket_getsockname, selinux_socket_getsockname), LSM_HOOK_INIT(socket_getpeername, selinux_socket_getpeername), LSM_HOOK_INIT(socket_getsockopt, selinux_socket_getsockopt), LSM_HOOK_INIT(socket_setsockopt, selinux_socket_setsockopt), LSM_HOOK_INIT(socket_shutdown, selinux_socket_shutdown), LSM_HOOK_INIT(socket_sock_rcv_skb, selinux_socket_sock_rcv_skb), LSM_HOOK_INIT(socket_getpeersec_stream, selinux_socket_getpeersec_stream), LSM_HOOK_INIT(socket_getpeersec_dgram, selinux_socket_getpeersec_dgram), LSM_HOOK_INIT(sk_free_security, selinux_sk_free_security), LSM_HOOK_INIT(sk_clone_security, selinux_sk_clone_security), LSM_HOOK_INIT(sk_getsecid, selinux_sk_getsecid), LSM_HOOK_INIT(sock_graft, selinux_sock_graft), LSM_HOOK_INIT(sctp_assoc_request, selinux_sctp_assoc_request), LSM_HOOK_INIT(sctp_sk_clone, selinux_sctp_sk_clone), LSM_HOOK_INIT(sctp_bind_connect, selinux_sctp_bind_connect), LSM_HOOK_INIT(sctp_assoc_established, selinux_sctp_assoc_established), LSM_HOOK_INIT(inet_conn_request, selinux_inet_conn_request), LSM_HOOK_INIT(inet_csk_clone, selinux_inet_csk_clone), LSM_HOOK_INIT(inet_conn_established, selinux_inet_conn_established), LSM_HOOK_INIT(secmark_relabel_packet, selinux_secmark_relabel_packet), LSM_HOOK_INIT(secmark_refcount_inc, selinux_secmark_refcount_inc), LSM_HOOK_INIT(secmark_refcount_dec, selinux_secmark_refcount_dec), LSM_HOOK_INIT(req_classify_flow, selinux_req_classify_flow), LSM_HOOK_INIT(tun_dev_free_security, selinux_tun_dev_free_security), LSM_HOOK_INIT(tun_dev_create, selinux_tun_dev_create), LSM_HOOK_INIT(tun_dev_attach_queue, selinux_tun_dev_attach_queue), LSM_HOOK_INIT(tun_dev_attach, selinux_tun_dev_attach), LSM_HOOK_INIT(tun_dev_open, selinux_tun_dev_open), #ifdef CONFIG_SECURITY_INFINIBAND LSM_HOOK_INIT(ib_pkey_access, selinux_ib_pkey_access), LSM_HOOK_INIT(ib_endport_manage_subnet, selinux_ib_endport_manage_subnet), LSM_HOOK_INIT(ib_free_security, selinux_ib_free_security), #endif #ifdef CONFIG_SECURITY_NETWORK_XFRM LSM_HOOK_INIT(xfrm_policy_free_security, selinux_xfrm_policy_free), LSM_HOOK_INIT(xfrm_policy_delete_security, selinux_xfrm_policy_delete), LSM_HOOK_INIT(xfrm_state_free_security, selinux_xfrm_state_free), LSM_HOOK_INIT(xfrm_state_delete_security, selinux_xfrm_state_delete), LSM_HOOK_INIT(xfrm_policy_lookup, selinux_xfrm_policy_lookup), LSM_HOOK_INIT(xfrm_state_pol_flow_match, selinux_xfrm_state_pol_flow_match), LSM_HOOK_INIT(xfrm_decode_session, selinux_xfrm_decode_session), #endif #ifdef CONFIG_KEYS LSM_HOOK_INIT(key_free, selinux_key_free), LSM_HOOK_INIT(key_permission, selinux_key_permission), LSM_HOOK_INIT(key_getsecurity, selinux_key_getsecurity), #ifdef CONFIG_KEY_NOTIFICATIONS LSM_HOOK_INIT(watch_key, selinux_watch_key), #endif #endif #ifdef CONFIG_AUDIT LSM_HOOK_INIT(audit_rule_known, selinux_audit_rule_known), LSM_HOOK_INIT(audit_rule_match, selinux_audit_rule_match), LSM_HOOK_INIT(audit_rule_free, selinux_audit_rule_free), #endif #ifdef CONFIG_BPF_SYSCALL LSM_HOOK_INIT(bpf, selinux_bpf), LSM_HOOK_INIT(bpf_map, selinux_bpf_map), LSM_HOOK_INIT(bpf_prog, selinux_bpf_prog), LSM_HOOK_INIT(bpf_map_free_security, selinux_bpf_map_free), LSM_HOOK_INIT(bpf_prog_free_security, selinux_bpf_prog_free), #endif #ifdef CONFIG_PERF_EVENTS LSM_HOOK_INIT(perf_event_open, selinux_perf_event_open), LSM_HOOK_INIT(perf_event_free, selinux_perf_event_free), LSM_HOOK_INIT(perf_event_read, selinux_perf_event_read), LSM_HOOK_INIT(perf_event_write, selinux_perf_event_write), #endif #ifdef CONFIG_IO_URING LSM_HOOK_INIT(uring_override_creds, selinux_uring_override_creds), LSM_HOOK_INIT(uring_sqpoll, selinux_uring_sqpoll), LSM_HOOK_INIT(uring_cmd, selinux_uring_cmd), #endif /* * PUT "CLONING" (ACCESSING + ALLOCATING) HOOKS HERE */ LSM_HOOK_INIT(fs_context_submount, selinux_fs_context_submount), LSM_HOOK_INIT(fs_context_dup, selinux_fs_context_dup), LSM_HOOK_INIT(fs_context_parse_param, selinux_fs_context_parse_param), LSM_HOOK_INIT(sb_eat_lsm_opts, selinux_sb_eat_lsm_opts), #ifdef CONFIG_SECURITY_NETWORK_XFRM LSM_HOOK_INIT(xfrm_policy_clone_security, selinux_xfrm_policy_clone), #endif /* * PUT "ALLOCATING" HOOKS HERE */ LSM_HOOK_INIT(msg_msg_alloc_security, selinux_msg_msg_alloc_security), LSM_HOOK_INIT(msg_queue_alloc_security, selinux_msg_queue_alloc_security), LSM_HOOK_INIT(shm_alloc_security, selinux_shm_alloc_security), LSM_HOOK_INIT(sb_alloc_security, selinux_sb_alloc_security), LSM_HOOK_INIT(inode_alloc_security, selinux_inode_alloc_security), LSM_HOOK_INIT(sem_alloc_security, selinux_sem_alloc_security), LSM_HOOK_INIT(secid_to_secctx, selinux_secid_to_secctx), LSM_HOOK_INIT(inode_getsecctx, selinux_inode_getsecctx), LSM_HOOK_INIT(sk_alloc_security, selinux_sk_alloc_security), LSM_HOOK_INIT(tun_dev_alloc_security, selinux_tun_dev_alloc_security), #ifdef CONFIG_SECURITY_INFINIBAND LSM_HOOK_INIT(ib_alloc_security, selinux_ib_alloc_security), #endif #ifdef CONFIG_SECURITY_NETWORK_XFRM LSM_HOOK_INIT(xfrm_policy_alloc_security, selinux_xfrm_policy_alloc), LSM_HOOK_INIT(xfrm_state_alloc, selinux_xfrm_state_alloc), LSM_HOOK_INIT(xfrm_state_alloc_acquire, selinux_xfrm_state_alloc_acquire), #endif #ifdef CONFIG_KEYS LSM_HOOK_INIT(key_alloc, selinux_key_alloc), #endif #ifdef CONFIG_AUDIT LSM_HOOK_INIT(audit_rule_init, selinux_audit_rule_init), #endif #ifdef CONFIG_BPF_SYSCALL LSM_HOOK_INIT(bpf_map_alloc_security, selinux_bpf_map_alloc), LSM_HOOK_INIT(bpf_prog_alloc_security, selinux_bpf_prog_alloc), #endif #ifdef CONFIG_PERF_EVENTS LSM_HOOK_INIT(perf_event_alloc, selinux_perf_event_alloc), #endif }; static __init int selinux_init(void) { pr_info("SELinux: Initializing.\n"); memset(&selinux_state, 0, sizeof(selinux_state)); enforcing_set(&selinux_state, selinux_enforcing_boot); if (CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE) pr_err("SELinux: CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE is non-zero. This is deprecated and will be rejected in a future kernel release.\n"); checkreqprot_set(&selinux_state, selinux_checkreqprot_boot); selinux_avc_init(&selinux_state.avc); mutex_init(&selinux_state.status_lock); mutex_init(&selinux_state.policy_mutex); /* Set the security state for the initial task. */ cred_init_security(); default_noexec = !(VM_DATA_DEFAULT_FLAGS & VM_EXEC); avc_init(); avtab_cache_init(); ebitmap_cache_init(); hashtab_cache_init(); security_add_hooks(selinux_hooks, ARRAY_SIZE(selinux_hooks), "selinux"); if (avc_add_callback(selinux_netcache_avc_callback, AVC_CALLBACK_RESET)) panic("SELinux: Unable to register AVC netcache callback\n"); if (avc_add_callback(selinux_lsm_notifier_avc_callback, AVC_CALLBACK_RESET)) panic("SELinux: Unable to register AVC LSM notifier callback\n"); if (selinux_enforcing_boot) pr_debug("SELinux: Starting in enforcing mode\n"); else pr_debug("SELinux: Starting in permissive mode\n"); fs_validate_description("selinux", selinux_fs_parameters); return 0; } static void delayed_superblock_init(struct super_block *sb, void *unused) { selinux_set_mnt_opts(sb, NULL, 0, NULL); } void selinux_complete_init(void) { pr_debug("SELinux: Completing initialization.\n"); /* Set up any superblocks initialized prior to the policy load. */ pr_debug("SELinux: Setting up existing superblocks.\n"); iterate_supers(delayed_superblock_init, NULL); } /* SELinux requires early initialization in order to label all processes and objects when they are created. */ DEFINE_LSM(selinux) = { .name = "selinux", .flags = LSM_FLAG_LEGACY_MAJOR | LSM_FLAG_EXCLUSIVE, .enabled = &selinux_enabled_boot, .blobs = &selinux_blob_sizes, .init = selinux_init, }; #if defined(CONFIG_NETFILTER) static const struct nf_hook_ops selinux_nf_ops[] = { { .hook = selinux_ip_postroute, .pf = NFPROTO_IPV4, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_SELINUX_LAST, }, { .hook = selinux_ip_forward, .pf = NFPROTO_IPV4, .hooknum = NF_INET_FORWARD, .priority = NF_IP_PRI_SELINUX_FIRST, }, { .hook = selinux_ip_output, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_SELINUX_FIRST, }, #if IS_ENABLED(CONFIG_IPV6) { .hook = selinux_ip_postroute, .pf = NFPROTO_IPV6, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP6_PRI_SELINUX_LAST, }, { .hook = selinux_ip_forward, .pf = NFPROTO_IPV6, .hooknum = NF_INET_FORWARD, .priority = NF_IP6_PRI_SELINUX_FIRST, }, { .hook = selinux_ip_output, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP6_PRI_SELINUX_FIRST, }, #endif /* IPV6 */ }; static int __net_init selinux_nf_register(struct net *net) { return nf_register_net_hooks(net, selinux_nf_ops, ARRAY_SIZE(selinux_nf_ops)); } static void __net_exit selinux_nf_unregister(struct net *net) { nf_unregister_net_hooks(net, selinux_nf_ops, ARRAY_SIZE(selinux_nf_ops)); } static struct pernet_operations selinux_net_ops = { .init = selinux_nf_register, .exit = selinux_nf_unregister, }; static int __init selinux_nf_ip_init(void) { int err; if (!selinux_enabled_boot) return 0; pr_debug("SELinux: Registering netfilter hooks\n"); err = register_pernet_subsys(&selinux_net_ops); if (err) panic("SELinux: register_pernet_subsys: error %d\n", err); return 0; } __initcall(selinux_nf_ip_init); #ifdef CONFIG_SECURITY_SELINUX_DISABLE static void selinux_nf_ip_exit(void) { pr_debug("SELinux: Unregistering netfilter hooks\n"); unregister_pernet_subsys(&selinux_net_ops); } #endif #else /* CONFIG_NETFILTER */ #ifdef CONFIG_SECURITY_SELINUX_DISABLE #define selinux_nf_ip_exit() #endif #endif /* CONFIG_NETFILTER */ #ifdef CONFIG_SECURITY_SELINUX_DISABLE int selinux_disable(struct selinux_state *state) { if (selinux_initialized(state)) { /* Not permitted after initial policy load. */ return -EINVAL; } if (selinux_disabled(state)) { /* Only do this once. */ return -EINVAL; } selinux_mark_disabled(state); pr_info("SELinux: Disabled at runtime.\n"); /* * Unregister netfilter hooks. * Must be done before security_delete_hooks() to avoid breaking * runtime disable. */ selinux_nf_ip_exit(); security_delete_hooks(selinux_hooks, ARRAY_SIZE(selinux_hooks)); /* Try to destroy the avc node cache */ avc_disable(); /* Unregister selinuxfs. */ exit_sel_fs(); return 0; } #endif |
6 1 5 6 3 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 | // SPDX-License-Identifier: GPL-2.0-only /* * (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> * Copyright (c) 2011 Patrick McHardy <kaber@trash.net> * * Based on Rusty Russell's IPv4 REDIRECT target. Development of IPv6 * NAT funded by Astaro. */ #include <linux/if.h> #include <linux/inetdevice.h> #include <linux/ip.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/netdevice.h> #include <linux/netfilter.h> #include <linux/types.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv6.h> #include <linux/netfilter/x_tables.h> #include <net/addrconf.h> #include <net/checksum.h> #include <net/protocol.h> #include <net/netfilter/nf_nat.h> #include <net/netfilter/nf_nat_redirect.h> static unsigned int redirect_tg6(struct sk_buff *skb, const struct xt_action_param *par) { return nf_nat_redirect_ipv6(skb, par->targinfo, xt_hooknum(par)); } static int redirect_tg6_checkentry(const struct xt_tgchk_param *par) { const struct nf_nat_range2 *range = par->targinfo; if (range->flags & NF_NAT_RANGE_MAP_IPS) return -EINVAL; return nf_ct_netns_get(par->net, par->family); } static void redirect_tg_destroy(const struct xt_tgdtor_param *par) { nf_ct_netns_put(par->net, par->family); } static int redirect_tg4_check(const struct xt_tgchk_param *par) { const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; if (mr->range[0].flags & NF_NAT_RANGE_MAP_IPS) { pr_debug("bad MAP_IPS.\n"); return -EINVAL; } if (mr->rangesize != 1) { pr_debug("bad rangesize %u.\n", mr->rangesize); return -EINVAL; } return nf_ct_netns_get(par->net, par->family); } static unsigned int redirect_tg4(struct sk_buff *skb, const struct xt_action_param *par) { const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; struct nf_nat_range2 range = { .flags = mr->range[0].flags, .min_proto = mr->range[0].min, .max_proto = mr->range[0].max, }; return nf_nat_redirect_ipv4(skb, &range, xt_hooknum(par)); } static struct xt_target redirect_tg_reg[] __read_mostly = { { .name = "REDIRECT", .family = NFPROTO_IPV6, .revision = 0, .table = "nat", .checkentry = redirect_tg6_checkentry, .destroy = redirect_tg_destroy, .target = redirect_tg6, .targetsize = sizeof(struct nf_nat_range), .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT), .me = THIS_MODULE, }, { .name = "REDIRECT", .family = NFPROTO_IPV4, .revision = 0, .table = "nat", .target = redirect_tg4, .checkentry = redirect_tg4_check, .destroy = redirect_tg_destroy, .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat), .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT), .me = THIS_MODULE, }, }; static int __init redirect_tg_init(void) { return xt_register_targets(redirect_tg_reg, ARRAY_SIZE(redirect_tg_reg)); } static void __exit redirect_tg_exit(void) { xt_unregister_targets(redirect_tg_reg, ARRAY_SIZE(redirect_tg_reg)); } module_init(redirect_tg_init); module_exit(redirect_tg_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_DESCRIPTION("Xtables: Connection redirection to localhost"); MODULE_ALIAS("ip6t_REDIRECT"); MODULE_ALIAS("ipt_REDIRECT"); |
4 4 4 2 2 425 425 425 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 | // SPDX-License-Identifier: GPL-2.0 /* * xfrm4_policy.c * * Changes: * Kazunori MIYAZAWA @USAGI * YOSHIFUJI Hideaki @USAGI * Split up af-specific portion * */ #include <linux/err.h> #include <linux/kernel.h> #include <linux/inetdevice.h> #include <net/dst.h> #include <net/xfrm.h> #include <net/ip.h> #include <net/l3mdev.h> static struct dst_entry *__xfrm4_dst_lookup(struct flowi4 *fl4, const struct xfrm_dst_lookup_params *params) { struct rtable *rt; memset(fl4, 0, sizeof(*fl4)); fl4->daddr = params->daddr->a4; fl4->flowi4_tos = params->tos; fl4->flowi4_l3mdev = l3mdev_master_ifindex_by_index(params->net, params->oif); fl4->flowi4_mark = params->mark; if (params->saddr) fl4->saddr = params->saddr->a4; fl4->flowi4_proto = params->ipproto; fl4->uli = params->uli; rt = __ip_route_output_key(params->net, fl4); if (!IS_ERR(rt)) return &rt->dst; return ERR_CAST(rt); } static struct dst_entry *xfrm4_dst_lookup(const struct xfrm_dst_lookup_params *params) { struct flowi4 fl4; return __xfrm4_dst_lookup(&fl4, params); } static int xfrm4_get_saddr(xfrm_address_t *saddr, const struct xfrm_dst_lookup_params *params) { struct dst_entry *dst; struct flowi4 fl4; dst = __xfrm4_dst_lookup(&fl4, params); if (IS_ERR(dst)) return -EHOSTUNREACH; saddr->a4 = fl4.saddr; dst_release(dst); return 0; } static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, const struct flowi *fl) { struct rtable *rt = (struct rtable *)xdst->route; const struct flowi4 *fl4 = &fl->u.ip4; xdst->u.rt.rt_iif = fl4->flowi4_iif; xdst->u.dst.dev = dev; netdev_hold(dev, &xdst->u.dst.dev_tracker, GFP_ATOMIC); /* Sheit... I remember I did this right. Apparently, * it was magically lost, so this code needs audit */ xdst->u.rt.rt_is_input = rt->rt_is_input; xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL); xdst->u.rt.rt_type = rt->rt_type; xdst->u.rt.rt_uses_gateway = rt->rt_uses_gateway; xdst->u.rt.rt_gw_family = rt->rt_gw_family; if (rt->rt_gw_family == AF_INET) xdst->u.rt.rt_gw4 = rt->rt_gw4; else if (rt->rt_gw_family == AF_INET6) xdst->u.rt.rt_gw6 = rt->rt_gw6; xdst->u.rt.rt_pmtu = rt->rt_pmtu; xdst->u.rt.rt_mtu_locked = rt->rt_mtu_locked; INIT_LIST_HEAD(&xdst->u.rt.rt_uncached); rt_add_uncached_list(&xdst->u.rt); return 0; } static void xfrm4_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, bool confirm_neigh) { struct xfrm_dst *xdst = (struct xfrm_dst *)dst; struct dst_entry *path = xdst->route; path->ops->update_pmtu(path, sk, skb, mtu, confirm_neigh); } static void xfrm4_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) { struct xfrm_dst *xdst = (struct xfrm_dst *)dst; struct dst_entry *path = xdst->route; path->ops->redirect(path, sk, skb); } static void xfrm4_dst_destroy(struct dst_entry *dst) { struct xfrm_dst *xdst = (struct xfrm_dst *)dst; dst_destroy_metrics_generic(dst); if (xdst->u.rt.rt_uncached_list) rt_del_uncached_list(&xdst->u.rt); xfrm_dst_destroy(xdst); } static void xfrm4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, int unregister) { if (!unregister) return; xfrm_dst_ifdown(dst, dev); } static struct dst_ops xfrm4_dst_ops_template = { .family = AF_INET, .update_pmtu = xfrm4_update_pmtu, .redirect = xfrm4_redirect, .cow_metrics = dst_cow_metrics_generic, .destroy = xfrm4_dst_destroy, .ifdown = xfrm4_dst_ifdown, .local_out = __ip_local_out, .gc_thresh = 32768, }; static const struct xfrm_policy_afinfo xfrm4_policy_afinfo = { .dst_ops = &xfrm4_dst_ops_template, .dst_lookup = xfrm4_dst_lookup, .get_saddr = xfrm4_get_saddr, .fill_dst = xfrm4_fill_dst, .blackhole_route = ipv4_blackhole_route, }; #ifdef CONFIG_SYSCTL static struct ctl_table xfrm4_policy_table[] = { { .procname = "xfrm4_gc_thresh", .data = &init_net.xfrm.xfrm4_dst_ops.gc_thresh, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { } }; static __net_init int xfrm4_net_sysctl_init(struct net *net) { struct ctl_table *table; struct ctl_table_header *hdr; table = xfrm4_policy_table; if (!net_eq(net, &init_net)) { table = kmemdup(table, sizeof(xfrm4_policy_table), GFP_KERNEL); if (!table) goto err_alloc; table[0].data = &net->xfrm.xfrm4_dst_ops.gc_thresh; } hdr = register_net_sysctl(net, "net/ipv4", table); if (!hdr) goto err_reg; net->ipv4.xfrm4_hdr = hdr; return 0; err_reg: if (!net_eq(net, &init_net)) kfree(table); err_alloc: return -ENOMEM; } static __net_exit void xfrm4_net_sysctl_exit(struct net *net) { struct ctl_table *table; if (!net->ipv4.xfrm4_hdr) return; table = net->ipv4.xfrm4_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv4.xfrm4_hdr); if (!net_eq(net, &init_net)) kfree(table); } #else /* CONFIG_SYSCTL */ static inline int xfrm4_net_sysctl_init(struct net *net) { return 0; } static inline void xfrm4_net_sysctl_exit(struct net *net) { } #endif static int __net_init xfrm4_net_init(struct net *net) { int ret; memcpy(&net->xfrm.xfrm4_dst_ops, &xfrm4_dst_ops_template, sizeof(xfrm4_dst_ops_template)); ret = dst_entries_init(&net->xfrm.xfrm4_dst_ops); if (ret) return ret; ret = xfrm4_net_sysctl_init(net); if (ret) dst_entries_destroy(&net->xfrm.xfrm4_dst_ops); return ret; } static void __net_exit xfrm4_net_exit(struct net *net) { xfrm4_net_sysctl_exit(net); dst_entries_destroy(&net->xfrm.xfrm4_dst_ops); } static struct pernet_operations __net_initdata xfrm4_net_ops = { .init = xfrm4_net_init, .exit = xfrm4_net_exit, }; static void __init xfrm4_policy_init(void) { xfrm_policy_register_afinfo(&xfrm4_policy_afinfo, AF_INET); } void __init xfrm4_init(void) { xfrm4_state_init(); xfrm4_policy_init(); xfrm4_protocol_init(); register_pernet_subsys(&xfrm4_net_ops); } |
139 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_IO_H #define _ASM_X86_IO_H /* * This file contains the definitions for the x86 IO instructions * inb/inw/inl/outb/outw/outl and the "string versions" of the same * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing" * versions of the single-IO instructions (inb_p/inw_p/..). * * This file is not meant to be obfuscating: it's just complicated * to (a) handle it all in a way that makes gcc able to optimize it * as well as possible and (b) trying to avoid writing the same thing * over and over again with slight variations and possibly making a * mistake somewhere. */ /* * Thanks to James van Artsdalen for a better timing-fix than * the two short jumps: using outb's to a nonexistent port seems * to guarantee better timings even on fast machines. * * On the other hand, I'd like to be sure of a non-existent port: * I feel a bit unsafe about using 0x80 (should be safe, though) * * Linus */ /* * Bit simplified and optimized by Jan Hubicka * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999. * * isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added, * isa_read[wl] and isa_write[wl] fixed * - Arnaldo Carvalho de Melo <acme@conectiva.com.br> */ #define ARCH_HAS_IOREMAP_WC #define ARCH_HAS_IOREMAP_WT #include <linux/string.h> #include <linux/compiler.h> #include <linux/cc_platform.h> #include <asm/page.h> #include <asm/early_ioremap.h> #include <asm/pgtable_types.h> #include <asm/shared/io.h> #define build_mmio_read(name, size, type, reg, barrier) \ static inline type name(const volatile void __iomem *addr) \ { type ret; asm volatile("mov" size " %1,%0":reg (ret) \ :"m" (*(volatile type __force *)addr) barrier); return ret; } #define build_mmio_write(name, size, type, reg, barrier) \ static inline void name(type val, volatile void __iomem *addr) \ { asm volatile("mov" size " %0,%1": :reg (val), \ "m" (*(volatile type __force *)addr) barrier); } build_mmio_read(readb, "b", unsigned char, "=q", :"memory") build_mmio_read(readw, "w", unsigned short, "=r", :"memory") build_mmio_read(readl, "l", unsigned int, "=r", :"memory") build_mmio_read(__readb, "b", unsigned char, "=q", ) build_mmio_read(__readw, "w", unsigned short, "=r", ) build_mmio_read(__readl, "l", unsigned int, "=r", ) build_mmio_write(writeb, "b", unsigned char, "q", :"memory") build_mmio_write(writew, "w", unsigned short, "r", :"memory") build_mmio_write(writel, "l", unsigned int, "r", :"memory") build_mmio_write(__writeb, "b", unsigned char, "q", ) build_mmio_write(__writew, "w", unsigned short, "r", ) build_mmio_write(__writel, "l", unsigned int, "r", ) #define readb readb #define readw readw #define readl readl #define readb_relaxed(a) __readb(a) #define readw_relaxed(a) __readw(a) #define readl_relaxed(a) __readl(a) #define __raw_readb __readb #define __raw_readw __readw #define __raw_readl __readl #define writeb writeb #define writew writew #define writel writel #define writeb_relaxed(v, a) __writeb(v, a) #define writew_relaxed(v, a) __writew(v, a) #define writel_relaxed(v, a) __writel(v, a) #define __raw_writeb __writeb #define __raw_writew __writew #define __raw_writel __writel #ifdef CONFIG_X86_64 build_mmio_read(readq, "q", u64, "=r", :"memory") build_mmio_read(__readq, "q", u64, "=r", ) build_mmio_write(writeq, "q", u64, "r", :"memory") build_mmio_write(__writeq, "q", u64, "r", ) #define readq_relaxed(a) __readq(a) #define writeq_relaxed(v, a) __writeq(v, a) #define __raw_readq __readq #define __raw_writeq __writeq /* Let people know that we have them */ #define readq readq #define writeq writeq #endif #define ARCH_HAS_VALID_PHYS_ADDR_RANGE extern int valid_phys_addr_range(phys_addr_t addr, size_t size); extern int valid_mmap_phys_addr_range(unsigned long pfn, size_t size); /** * virt_to_phys - map virtual addresses to physical * @address: address to remap * * The returned physical address is the physical (CPU) mapping for * the memory address given. It is only valid to use this function on * addresses directly mapped or allocated via kmalloc. * * This function does not give bus mappings for DMA transfers. In * almost all conceivable cases a device driver should not be using * this function */ static inline phys_addr_t virt_to_phys(volatile void *address) { return __pa(address); } #define virt_to_phys virt_to_phys /** * phys_to_virt - map physical address to virtual * @address: address to remap * * The returned virtual address is a current CPU mapping for * the memory address given. It is only valid to use this function on * addresses that have a kernel mapping * * This function does not handle bus mappings for DMA transfers. In * almost all conceivable cases a device driver should not be using * this function */ static inline void *phys_to_virt(phys_addr_t address) { return __va(address); } #define phys_to_virt phys_to_virt /* * Change "struct page" to physical address. */ #define page_to_phys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT) /* * ISA I/O bus memory addresses are 1:1 with the physical address. * However, we truncate the address to unsigned int to avoid undesirable * promotions in legacy drivers. */ static inline unsigned int isa_virt_to_bus(volatile void *address) { return (unsigned int)virt_to_phys(address); } #define isa_bus_to_virt phys_to_virt /* * The default ioremap() behavior is non-cached; if you need something * else, you probably want one of the following. */ extern void __iomem *ioremap_uc(resource_size_t offset, unsigned long size); #define ioremap_uc ioremap_uc extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size); #define ioremap_cache ioremap_cache extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, unsigned long prot_val); #define ioremap_prot ioremap_prot extern void __iomem *ioremap_encrypted(resource_size_t phys_addr, unsigned long size); #define ioremap_encrypted ioremap_encrypted /** * ioremap - map bus memory into CPU space * @offset: bus address of the memory * @size: size of the resource to map * * ioremap performs a platform specific sequence of operations to * make bus memory CPU accessible via the readb/readw/readl/writeb/ * writew/writel functions and the other mmio helpers. The returned * address is not guaranteed to be usable directly as a virtual * address. * * If the area you are trying to map is a PCI BAR you should have a * look at pci_iomap(). */ void __iomem *ioremap(resource_size_t offset, unsigned long size); #define ioremap ioremap extern void iounmap(volatile void __iomem *addr); #define iounmap iounmap #ifdef __KERNEL__ void memcpy_fromio(void *, const volatile void __iomem *, size_t); void memcpy_toio(volatile void __iomem *, const void *, size_t); void memset_io(volatile void __iomem *, int, size_t); #define memcpy_fromio memcpy_fromio #define memcpy_toio memcpy_toio #define memset_io memset_io #include <asm-generic/iomap.h> /* * ISA space is 'always mapped' on a typical x86 system, no need to * explicitly ioremap() it. The fact that the ISA IO space is mapped * to PAGE_OFFSET is pure coincidence - it does not mean ISA values * are physical addresses. The following constant pointer can be * used as the IO-area pointer (it can be iounmapped as well, so the * analogy with PCI is quite large): */ #define __ISA_IO_base ((char __iomem *)(PAGE_OFFSET)) #endif /* __KERNEL__ */ extern void native_io_delay(void); extern int io_delay_type; extern void io_delay_init(void); #if defined(CONFIG_PARAVIRT) #include <asm/paravirt.h> #else static inline void slow_down_io(void) { native_io_delay(); #ifdef REALLY_SLOW_IO native_io_delay(); native_io_delay(); native_io_delay(); #endif } #endif #define BUILDIO(bwl, bw, type) \ static inline void out##bwl##_p(type value, u16 port) \ { \ out##bwl(value, port); \ slow_down_io(); \ } \ \ static inline type in##bwl##_p(u16 port) \ { \ type value = in##bwl(port); \ slow_down_io(); \ return value; \ } \ \ static inline void outs##bwl(u16 port, const void *addr, unsigned long count) \ { \ if (cc_platform_has(CC_ATTR_GUEST_UNROLL_STRING_IO)) { \ type *value = (type *)addr; \ while (count) { \ out##bwl(*value, port); \ value++; \ count--; \ } \ } else { \ asm volatile("rep; outs" #bwl \ : "+S"(addr), "+c"(count) \ : "d"(port) : "memory"); \ } \ } \ \ static inline void ins##bwl(u16 port, void *addr, unsigned long count) \ { \ if (cc_platform_has(CC_ATTR_GUEST_UNROLL_STRING_IO)) { \ type *value = (type *)addr; \ while (count) { \ *value = in##bwl(port); \ value++; \ count--; \ } \ } else { \ asm volatile("rep; ins" #bwl \ : "+D"(addr), "+c"(count) \ : "d"(port) : "memory"); \ } \ } BUILDIO(b, b, u8) BUILDIO(w, w, u16) BUILDIO(l, , u32) #undef BUILDIO #define inb_p inb_p #define inw_p inw_p #define inl_p inl_p #define insb insb #define insw insw #define insl insl #define outb_p outb_p #define outw_p outw_p #define outl_p outl_p #define outsb outsb #define outsw outsw #define outsl outsl extern void *xlate_dev_mem_ptr(phys_addr_t phys); extern void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr); #define xlate_dev_mem_ptr xlate_dev_mem_ptr #define unxlate_dev_mem_ptr unxlate_dev_mem_ptr extern int ioremap_change_attr(unsigned long vaddr, unsigned long size, enum page_cache_mode pcm); extern void __iomem *ioremap_wc(resource_size_t offset, unsigned long size); #define ioremap_wc ioremap_wc extern void __iomem *ioremap_wt(resource_size_t offset, unsigned long size); #define ioremap_wt ioremap_wt extern bool is_early_ioremap_ptep(pte_t *ptep); #define IO_SPACE_LIMIT 0xffff #include <asm-generic/io.h> #undef PCI_IOBASE #ifdef CONFIG_MTRR extern int __must_check arch_phys_wc_index(int handle); #define arch_phys_wc_index arch_phys_wc_index extern int __must_check arch_phys_wc_add(unsigned long base, unsigned long size); extern void arch_phys_wc_del(int handle); #define arch_phys_wc_add arch_phys_wc_add #endif #ifdef CONFIG_X86_PAT extern int arch_io_reserve_memtype_wc(resource_size_t start, resource_size_t size); extern void arch_io_free_memtype_wc(resource_size_t start, resource_size_t size); #define arch_io_reserve_memtype_wc arch_io_reserve_memtype_wc #endif #ifdef CONFIG_AMD_MEM_ENCRYPT extern bool arch_memremap_can_ram_remap(resource_size_t offset, unsigned long size, unsigned long flags); #define arch_memremap_can_ram_remap arch_memremap_can_ram_remap extern bool phys_mem_access_encrypted(unsigned long phys_addr, unsigned long size); #else static inline bool phys_mem_access_encrypted(unsigned long phys_addr, unsigned long size) { return true; } #endif /** * iosubmit_cmds512 - copy data to single MMIO location, in 512-bit units * @dst: destination, in MMIO space (must be 512-bit aligned) * @src: source * @count: number of 512 bits quantities to submit * * Submit data from kernel space to MMIO space, in units of 512 bits at a * time. Order of access is not guaranteed, nor is a memory barrier * performed afterwards. * * Warning: Do not use this helper unless your driver has checked that the CPU * instruction is supported on the platform. */ static inline void iosubmit_cmds512(void __iomem *dst, const void *src, size_t count) { const u8 *from = src; const u8 *end = from + count * 64; while (from < end) { movdir64b(dst, from); from += 64; } } #endif /* _ASM_X86_IO_H */ |
121 121 6 28 29 29 35 1 35 2 35 35 35 1 29 35 35 35 35 1 17 35 35 11 17 29 35 35 29 28 27 29 21 3 15 1 1 1 18 1 17 15 6 1 2 1 1 2 3 2 1 1 47 47 2 1 1 85 85 85 84 85 85 84 85 85 84 85 33 4 9 9 1 4 2 3 1 1 1 1 1 1 1 1 9 6 9 2 3 8 22 22 14 8 12 10 4 3 15 6 16 16 12 12 12 12 12 4 12 3 1 10 2 10 22 22 4 4 18 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 | // SPDX-License-Identifier: GPL-2.0-only #include "cgroup-internal.h" #include <linux/ctype.h> #include <linux/kmod.h> #include <linux/sort.h> #include <linux/delay.h> #include <linux/mm.h> #include <linux/sched/signal.h> #include <linux/sched/task.h> #include <linux/magic.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/delayacct.h> #include <linux/pid_namespace.h> #include <linux/cgroupstats.h> #include <linux/fs_parser.h> #include <trace/events/cgroup.h> #include <trace/hooks/cgroup.h> /* * pidlists linger the following amount before being destroyed. The goal * is avoiding frequent destruction in the middle of consecutive read calls * Expiring in the middle is a performance problem not a correctness one. * 1 sec should be enough. */ #define CGROUP_PIDLIST_DESTROY_DELAY HZ /* Controllers blocked by the commandline in v1 */ static u16 cgroup_no_v1_mask; /* disable named v1 mounts */ static bool cgroup_no_v1_named; /* * pidlist destructions need to be flushed on cgroup destruction. Use a * separate workqueue as flush domain. */ static struct workqueue_struct *cgroup_pidlist_destroy_wq; /* protects cgroup_subsys->release_agent_path */ static DEFINE_SPINLOCK(release_agent_path_lock); bool cgroup1_ssid_disabled(int ssid) { return cgroup_no_v1_mask & (1 << ssid); } /** * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from' * @from: attach to all cgroups of a given task * @tsk: the task to be attached * * Return: %0 on success or a negative errno code on failure */ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) { struct cgroup_root *root; int retval = 0; cgroup_lock(); cgroup_attach_lock(true); for_each_root(root) { struct cgroup *from_cgrp; spin_lock_irq(&css_set_lock); from_cgrp = task_cgroup_from_root(from, root); spin_unlock_irq(&css_set_lock); retval = cgroup_attach_task(from_cgrp, tsk, false); if (retval) break; } cgroup_attach_unlock(true); cgroup_unlock(); return retval; } EXPORT_SYMBOL_GPL(cgroup_attach_task_all); /** * cgroup_transfer_tasks - move tasks from one cgroup to another * @to: cgroup to which the tasks will be moved * @from: cgroup in which the tasks currently reside * * Locking rules between cgroup_post_fork() and the migration path * guarantee that, if a task is forking while being migrated, the new child * is guaranteed to be either visible in the source cgroup after the * parent's migration is complete or put into the target cgroup. No task * can slip out of migration through forking. * * Return: %0 on success or a negative errno code on failure */ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) { DEFINE_CGROUP_MGCTX(mgctx); struct cgrp_cset_link *link; struct css_task_iter it; struct task_struct *task; int ret; if (cgroup_on_dfl(to)) return -EINVAL; ret = cgroup_migrate_vet_dst(to); if (ret) return ret; cgroup_lock(); cgroup_attach_lock(true); /* all tasks in @from are being moved, all csets are source */ spin_lock_irq(&css_set_lock); list_for_each_entry(link, &from->cset_links, cset_link) cgroup_migrate_add_src(link->cset, to, &mgctx); spin_unlock_irq(&css_set_lock); ret = cgroup_migrate_prepare_dst(&mgctx); if (ret) goto out_err; /* * Migrate tasks one-by-one until @from is empty. This fails iff * ->can_attach() fails. */ do { css_task_iter_start(&from->self, 0, &it); do { task = css_task_iter_next(&it); } while (task && (task->flags & PF_EXITING)); if (task) get_task_struct(task); css_task_iter_end(&it); if (task) { ret = cgroup_migrate(task, false, &mgctx); if (!ret) TRACE_CGROUP_PATH(transfer_tasks, to, task, false); put_task_struct(task); } } while (task && !ret); out_err: cgroup_migrate_finish(&mgctx); cgroup_attach_unlock(true); cgroup_unlock(); return ret; } /* * Stuff for reading the 'tasks'/'procs' files. * * Reading this file can return large amounts of data if a cgroup has * *lots* of attached tasks. So it may need several calls to read(), * but we cannot guarantee that the information we produce is correct * unless we produce it entirely atomically. * */ /* which pidlist file are we talking about? */ enum cgroup_filetype { CGROUP_FILE_PROCS, CGROUP_FILE_TASKS, }; /* * A pidlist is a list of pids that virtually represents the contents of one * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists, * a pair (one each for procs, tasks) for each pid namespace that's relevant * to the cgroup. */ struct cgroup_pidlist { /* * used to find which pidlist is wanted. doesn't change as long as * this particular list stays in the list. */ struct { enum cgroup_filetype type; struct pid_namespace *ns; } key; /* array of xids */ pid_t *list; /* how many elements the above list has */ int length; /* each of these stored in a list by its cgroup */ struct list_head links; /* pointer to the cgroup we belong to, for list removal purposes */ struct cgroup *owner; /* for delayed destruction */ struct delayed_work destroy_dwork; }; /* * Used to destroy all pidlists lingering waiting for destroy timer. None * should be left afterwards. */ void cgroup1_pidlist_destroy_all(struct cgroup *cgrp) { struct cgroup_pidlist *l, *tmp_l; mutex_lock(&cgrp->pidlist_mutex); list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links) mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0); mutex_unlock(&cgrp->pidlist_mutex); flush_workqueue(cgroup_pidlist_destroy_wq); BUG_ON(!list_empty(&cgrp->pidlists)); } static void cgroup_pidlist_destroy_work_fn(struct work_struct *work) { struct delayed_work *dwork = to_delayed_work(work); struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist, destroy_dwork); struct cgroup_pidlist *tofree = NULL; mutex_lock(&l->owner->pidlist_mutex); /* * Destroy iff we didn't get queued again. The state won't change * as destroy_dwork can only be queued while locked. */ if (!delayed_work_pending(dwork)) { list_del(&l->links); kvfree(l->list); put_pid_ns(l->key.ns); tofree = l; } mutex_unlock(&l->owner->pidlist_mutex); kfree(tofree); } /* * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries * Returns the number of unique elements. */ static int pidlist_uniq(pid_t *list, int length) { int src, dest = 1; /* * we presume the 0th element is unique, so i starts at 1. trivial * edge cases first; no work needs to be done for either */ if (length == 0 || length == 1) return length; /* src and dest walk down the list; dest counts unique elements */ for (src = 1; src < length; src++) { /* find next unique element */ while (list[src] == list[src-1]) { src++; if (src == length) goto after; } /* dest always points to where the next unique element goes */ list[dest] = list[src]; dest++; } after: return dest; } /* * The two pid files - task and cgroup.procs - guaranteed that the result * is sorted, which forced this whole pidlist fiasco. As pid order is * different per namespace, each namespace needs differently sorted list, * making it impossible to use, for example, single rbtree of member tasks * sorted by task pointer. As pidlists can be fairly large, allocating one * per open file is dangerous, so cgroup had to implement shared pool of * pidlists keyed by cgroup and namespace. */ static int cmppid(const void *a, const void *b) { return *(pid_t *)a - *(pid_t *)b; } static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, enum cgroup_filetype type) { struct cgroup_pidlist *l; /* don't need task_nsproxy() if we're looking at ourself */ struct pid_namespace *ns = task_active_pid_ns(current); lockdep_assert_held(&cgrp->pidlist_mutex); list_for_each_entry(l, &cgrp->pidlists, links) if (l->key.type == type && l->key.ns == ns) return l; return NULL; } /* * find the appropriate pidlist for our purpose (given procs vs tasks) * returns with the lock on that pidlist already held, and takes care * of the use count, or returns NULL with no locks held if we're out of * memory. */ static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp, enum cgroup_filetype type) { struct cgroup_pidlist *l; lockdep_assert_held(&cgrp->pidlist_mutex); l = cgroup_pidlist_find(cgrp, type); if (l) return l; /* entry not found; create a new one */ l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); if (!l) return l; INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn); l->key.type = type; /* don't need task_nsproxy() if we're looking at ourself */ l->key.ns = get_pid_ns(task_active_pid_ns(current)); l->owner = cgrp; list_add(&l->links, &cgrp->pidlists); return l; } /* * Load a cgroup's pidarray with either procs' tgids or tasks' pids */ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, struct cgroup_pidlist **lp) { pid_t *array; int length; int pid, n = 0; /* used for populating the array */ struct css_task_iter it; struct task_struct *tsk; struct cgroup_pidlist *l; lockdep_assert_held(&cgrp->pidlist_mutex); /* * If cgroup gets more users after we read count, we won't have * enough space - tough. This race is indistinguishable to the * caller from the case that the additional cgroup users didn't * show up until sometime later on. */ length = cgroup_task_count(cgrp); array = kvmalloc_array(length, sizeof(pid_t), GFP_KERNEL); if (!array) return -ENOMEM; /* now, populate the array */ css_task_iter_start(&cgrp->self, 0, &it); while ((tsk = css_task_iter_next(&it))) { if (unlikely(n == length)) break; /* get tgid or pid for procs or tasks file respectively */ if (type == CGROUP_FILE_PROCS) pid = task_tgid_vnr(tsk); else pid = task_pid_vnr(tsk); if (pid > 0) /* make sure to only use valid results */ array[n++] = pid; } css_task_iter_end(&it); length = n; /* now sort & strip out duplicates (tgids or recycled thread PIDs) */ sort(array, length, sizeof(pid_t), cmppid, NULL); length = pidlist_uniq(array, length); l = cgroup_pidlist_find_create(cgrp, type); if (!l) { kvfree(array); return -ENOMEM; } /* store array, freeing old if necessary */ kvfree(l->list); l->list = array; l->length = length; *lp = l; return 0; } /* * seq_file methods for the tasks/procs files. The seq_file position is the * next pid to display; the seq_file iterator is a pointer to the pid * in the cgroup->l->list array. */ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) { /* * Initially we receive a position value that corresponds to * one more than the last pid shown (or 0 on the first call or * after a seek to the start). Use a binary-search to find the * next pid to display, if any */ struct kernfs_open_file *of = s->private; struct cgroup_file_ctx *ctx = of->priv; struct cgroup *cgrp = seq_css(s)->cgroup; struct cgroup_pidlist *l; enum cgroup_filetype type = seq_cft(s)->private; int index = 0, pid = *pos; int *iter, ret; mutex_lock(&cgrp->pidlist_mutex); /* * !NULL @ctx->procs1.pidlist indicates that this isn't the first * start() after open. If the matching pidlist is around, we can use * that. Look for it. Note that @ctx->procs1.pidlist can't be used * directly. It could already have been destroyed. */ if (ctx->procs1.pidlist) ctx->procs1.pidlist = cgroup_pidlist_find(cgrp, type); /* * Either this is the first start() after open or the matching * pidlist has been destroyed inbetween. Create a new one. */ if (!ctx->procs1.pidlist) { ret = pidlist_array_load(cgrp, type, &ctx->procs1.pidlist); if (ret) return ERR_PTR(ret); } l = ctx->procs1.pidlist; if (pid) { int end = l->length; while (index < end) { int mid = (index + end) / 2; if (l->list[mid] == pid) { index = mid; break; } else if (l->list[mid] <= pid) index = mid + 1; else end = mid; } } /* If we're off the end of the array, we're done */ if (index >= l->length) return NULL; /* Update the abstract position to be the actual pid that we found */ iter = l->list + index; *pos = *iter; return iter; } static void cgroup_pidlist_stop(struct seq_file *s, void *v) { struct kernfs_open_file *of = s->private; struct cgroup_file_ctx *ctx = of->priv; struct cgroup_pidlist *l = ctx->procs1.pidlist; if (l) mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, CGROUP_PIDLIST_DESTROY_DELAY); mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex); } static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) { struct kernfs_open_file *of = s->private; struct cgroup_file_ctx *ctx = of->priv; struct cgroup_pidlist *l = ctx->procs1.pidlist; pid_t *p = v; pid_t *end = l->list + l->length; /* * Advance to the next pid in the array. If this goes off the * end, we're done */ p++; if (p >= end) { (*pos)++; return NULL; } else { *pos = *p; return p; } } static int cgroup_pidlist_show(struct seq_file *s, void *v) { seq_printf(s, "%d\n", *(int *)v); return 0; } static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off, bool threadgroup) { struct cgroup *cgrp; struct task_struct *task; const struct cred *cred, *tcred; ssize_t ret; bool locked; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENODEV; task = cgroup_procs_write_start(buf, threadgroup, &locked, cgrp); ret = PTR_ERR_OR_ZERO(task); if (ret) goto out_unlock; /* * Even if we're attaching all tasks in the thread group, we only need * to check permissions on one of them. Check permissions using the * credentials from file open to protect against inherited fd attacks. */ cred = of->file->f_cred; tcred = get_task_cred(task); if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && !uid_eq(cred->euid, tcred->uid) && !uid_eq(cred->euid, tcred->suid) && !ns_capable(tcred->user_ns, CAP_SYS_NICE)) ret = -EACCES; put_cred(tcred); if (ret) goto out_finish; ret = cgroup_attach_task(cgrp, task, threadgroup); trace_android_vh_cgroup_set_task(ret, task); out_finish: cgroup_procs_write_finish(task, locked); out_unlock: cgroup_kn_unlock(of->kn); return ret ?: nbytes; } static ssize_t cgroup1_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return __cgroup1_procs_write(of, buf, nbytes, off, true); } static ssize_t cgroup1_tasks_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return __cgroup1_procs_write(of, buf, nbytes, off, false); } static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cgroup *cgrp; struct cgroup_file_ctx *ctx; BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); /* * Release agent gets called with all capabilities, * require capabilities to set release agent. */ ctx = of->priv; if ((ctx->ns->user_ns != &init_user_ns) || !file_ns_capable(of->file, &init_user_ns, CAP_SYS_ADMIN)) return -EPERM; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENODEV; spin_lock(&release_agent_path_lock); strlcpy(cgrp->root->release_agent_path, strstrip(buf), sizeof(cgrp->root->release_agent_path)); spin_unlock(&release_agent_path_lock); cgroup_kn_unlock(of->kn); return nbytes; } static int cgroup_release_agent_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; spin_lock(&release_agent_path_lock); seq_puts(seq, cgrp->root->release_agent_path); spin_unlock(&release_agent_path_lock); seq_putc(seq, '\n'); return 0; } static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) { seq_puts(seq, "0\n"); return 0; } static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, struct cftype *cft) { return notify_on_release(css->cgroup); } static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { if (val) set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); else clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); return 0; } static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, struct cftype *cft) { return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); } static int cgroup_clone_children_write(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { if (val) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); else clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); return 0; } /* cgroup core interface files for the legacy hierarchies */ struct cftype cgroup1_base_files[] = { { .name = "cgroup.procs", .seq_start = cgroup_pidlist_start, .seq_next = cgroup_pidlist_next, .seq_stop = cgroup_pidlist_stop, .seq_show = cgroup_pidlist_show, .private = CGROUP_FILE_PROCS, .write = cgroup1_procs_write, }, { .name = "cgroup.clone_children", .read_u64 = cgroup_clone_children_read, .write_u64 = cgroup_clone_children_write, }, { .name = "cgroup.sane_behavior", .flags = CFTYPE_ONLY_ON_ROOT, .seq_show = cgroup_sane_behavior_show, }, { .name = "tasks", .seq_start = cgroup_pidlist_start, .seq_next = cgroup_pidlist_next, .seq_stop = cgroup_pidlist_stop, .seq_show = cgroup_pidlist_show, .private = CGROUP_FILE_TASKS, .write = cgroup1_tasks_write, }, { .name = "notify_on_release", .read_u64 = cgroup_read_notify_on_release, .write_u64 = cgroup_write_notify_on_release, }, { .name = "release_agent", .flags = CFTYPE_ONLY_ON_ROOT, .seq_show = cgroup_release_agent_show, .write = cgroup_release_agent_write, .max_write_len = PATH_MAX - 1, }, { } /* terminate */ }; /* Display information about each subsystem and each hierarchy */ int proc_cgroupstats_show(struct seq_file *m, void *v) { struct cgroup_subsys *ss; int i; seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); /* * Grab the subsystems state racily. No need to add avenue to * cgroup_mutex contention. */ for_each_subsys(ss, i) seq_printf(m, "%s\t%d\t%d\t%d\n", ss->legacy_name, ss->root->hierarchy_id, atomic_read(&ss->root->nr_cgrps), cgroup_ssid_enabled(i)); return 0; } /** * cgroupstats_build - build and fill cgroupstats * @stats: cgroupstats to fill information into * @dentry: A dentry entry belonging to the cgroup for which stats have * been requested. * * Build and fill cgroupstats so that taskstats can export it to user * space. * * Return: %0 on success or a negative errno code on failure */ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) { struct kernfs_node *kn = kernfs_node_from_dentry(dentry); struct cgroup *cgrp; struct css_task_iter it; struct task_struct *tsk; /* it should be kernfs_node belonging to cgroupfs and is a directory */ if (dentry->d_sb->s_type != &cgroup_fs_type || !kn || kernfs_type(kn) != KERNFS_DIR) return -EINVAL; /* * We aren't being called from kernfs and there's no guarantee on * @kn->priv's validity. For this and css_tryget_online_from_dir(), * @kn->priv is RCU safe. Let's do the RCU dancing. */ rcu_read_lock(); cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv); if (!cgrp || !cgroup_tryget(cgrp)) { rcu_read_unlock(); return -ENOENT; } rcu_read_unlock(); css_task_iter_start(&cgrp->self, 0, &it); while ((tsk = css_task_iter_next(&it))) { switch (READ_ONCE(tsk->__state)) { case TASK_RUNNING: stats->nr_running++; break; case TASK_INTERRUPTIBLE: stats->nr_sleeping++; break; case TASK_UNINTERRUPTIBLE: stats->nr_uninterruptible++; break; case TASK_STOPPED: stats->nr_stopped++; break; default: if (tsk->in_iowait) stats->nr_io_wait++; break; } } css_task_iter_end(&it); cgroup_put(cgrp); return 0; } void cgroup1_check_for_release(struct cgroup *cgrp) { if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) && !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp)) schedule_work(&cgrp->release_agent_work); } /* * Notify userspace when a cgroup is released, by running the * configured release agent with the name of the cgroup (path * relative to the root of cgroup file system) as the argument. * * Most likely, this user command will try to rmdir this cgroup. * * This races with the possibility that some other task will be * attached to this cgroup before it is removed, or that some other * user task will 'mkdir' a child cgroup of this cgroup. That's ok. * The presumed 'rmdir' will fail quietly if this cgroup is no longer * unused, and this cgroup will be reprieved from its death sentence, * to continue to serve a useful existence. Next time it's released, * we will get notified again, if it still has 'notify_on_release' set. * * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which * means only wait until the task is successfully execve()'d. The * separate release agent task is forked by call_usermodehelper(), * then control in this thread returns here, without waiting for the * release agent task. We don't bother to wait because the caller of * this routine has no use for the exit status of the release agent * task, so no sense holding our caller up for that. */ void cgroup1_release_agent(struct work_struct *work) { struct cgroup *cgrp = container_of(work, struct cgroup, release_agent_work); char *pathbuf, *agentbuf; char *argv[3], *envp[3]; int ret; /* snoop agent path and exit early if empty */ if (!cgrp->root->release_agent_path[0]) return; /* prepare argument buffers */ pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); agentbuf = kmalloc(PATH_MAX, GFP_KERNEL); if (!pathbuf || !agentbuf) goto out_free; spin_lock(&release_agent_path_lock); strlcpy(agentbuf, cgrp->root->release_agent_path, PATH_MAX); spin_unlock(&release_agent_path_lock); if (!agentbuf[0]) goto out_free; ret = cgroup_path_ns(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); if (ret < 0) goto out_free; argv[0] = agentbuf; argv[1] = pathbuf; argv[2] = NULL; /* minimal command environment */ envp[0] = "HOME=/"; envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; envp[2] = NULL; call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); out_free: kfree(agentbuf); kfree(pathbuf); } /* * cgroup_rename - Only allow simple rename of directories in place. */ static int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, const char *new_name_str) { struct cgroup *cgrp = kn->priv; int ret; /* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */ if (strchr(new_name_str, '\n')) return -EINVAL; if (kernfs_type(kn) != KERNFS_DIR) return -ENOTDIR; if (kn->parent != new_parent) return -EIO; /* * We're gonna grab cgroup_mutex which nests outside kernfs * active_ref. kernfs_rename() doesn't require active_ref * protection. Break them before grabbing cgroup_mutex. */ kernfs_break_active_protection(new_parent); kernfs_break_active_protection(kn); cgroup_lock(); ret = kernfs_rename(kn, new_parent, new_name_str); if (!ret) TRACE_CGROUP_PATH(rename, cgrp); cgroup_unlock(); kernfs_unbreak_active_protection(kn); kernfs_unbreak_active_protection(new_parent); return ret; } static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_root) { struct cgroup_root *root = cgroup_root_from_kf(kf_root); struct cgroup_subsys *ss; int ssid; for_each_subsys(ss, ssid) if (root->subsys_mask & (1 << ssid)) seq_show_option(seq, ss->legacy_name, NULL); if (root->flags & CGRP_ROOT_NOPREFIX) seq_puts(seq, ",noprefix"); if (root->flags & CGRP_ROOT_XATTR) seq_puts(seq, ",xattr"); if (root->flags & CGRP_ROOT_CPUSET_V2_MODE) seq_puts(seq, ",cpuset_v2_mode"); if (root->flags & CGRP_ROOT_FAVOR_DYNMODS) seq_puts(seq, ",favordynmods"); spin_lock(&release_agent_path_lock); if (strlen(root->release_agent_path)) seq_show_option(seq, "release_agent", root->release_agent_path); spin_unlock(&release_agent_path_lock); if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags)) seq_puts(seq, ",clone_children"); if (strlen(root->name)) seq_show_option(seq, "name", root->name); return 0; } enum cgroup1_param { Opt_all, Opt_clone_children, Opt_cpuset_v2_mode, Opt_name, Opt_none, Opt_noprefix, Opt_release_agent, Opt_xattr, Opt_favordynmods, Opt_nofavordynmods, }; const struct fs_parameter_spec cgroup1_fs_parameters[] = { fsparam_flag ("all", Opt_all), fsparam_flag ("clone_children", Opt_clone_children), fsparam_flag ("cpuset_v2_mode", Opt_cpuset_v2_mode), fsparam_string("name", Opt_name), fsparam_flag ("none", Opt_none), fsparam_flag ("noprefix", Opt_noprefix), fsparam_string("release_agent", Opt_release_agent), fsparam_flag ("xattr", Opt_xattr), fsparam_flag ("favordynmods", Opt_favordynmods), fsparam_flag ("nofavordynmods", Opt_nofavordynmods), {} }; int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); struct cgroup_subsys *ss; struct fs_parse_result result; int opt, i; opt = fs_parse(fc, cgroup1_fs_parameters, param, &result); if (opt == -ENOPARAM) { int ret; ret = vfs_parse_fs_param_source(fc, param); if (ret != -ENOPARAM) return ret; for_each_subsys(ss, i) { if (strcmp(param->key, ss->legacy_name)) continue; if (!cgroup_ssid_enabled(i) || cgroup1_ssid_disabled(i)) return invalfc(fc, "Disabled controller '%s'", param->key); ctx->subsys_mask |= (1 << i); return 0; } return invalfc(fc, "Unknown subsys name '%s'", param->key); } if (opt < 0) return opt; switch (opt) { case Opt_none: /* Explicitly have no subsystems */ ctx->none = true; break; case Opt_all: ctx->all_ss = true; break; case Opt_noprefix: ctx->flags |= CGRP_ROOT_NOPREFIX; break; case Opt_clone_children: ctx->cpuset_clone_children = true; break; case Opt_cpuset_v2_mode: ctx->flags |= CGRP_ROOT_CPUSET_V2_MODE; break; case Opt_xattr: ctx->flags |= CGRP_ROOT_XATTR; break; case Opt_favordynmods: ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS; break; case Opt_nofavordynmods: ctx->flags &= ~CGRP_ROOT_FAVOR_DYNMODS; break; case Opt_release_agent: /* Specifying two release agents is forbidden */ if (ctx->release_agent) return invalfc(fc, "release_agent respecified"); /* * Release agent gets called with all capabilities, * require capabilities to set release agent. */ if ((fc->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN)) return invalfc(fc, "Setting release_agent not allowed"); ctx->release_agent = param->string; param->string = NULL; break; case Opt_name: /* blocked by boot param? */ if (cgroup_no_v1_named) return -ENOENT; /* Can't specify an empty name */ if (!param->size) return invalfc(fc, "Empty name"); if (param->size > MAX_CGROUP_ROOT_NAMELEN - 1) return invalfc(fc, "Name too long"); /* Must match [\w.-]+ */ for (i = 0; i < param->size; i++) { char c = param->string[i]; if (isalnum(c)) continue; if ((c == '.') || (c == '-') || (c == '_')) continue; return invalfc(fc, "Invalid name"); } /* Specifying two names is forbidden */ if (ctx->name) return invalfc(fc, "name respecified"); ctx->name = param->string; param->string = NULL; break; } return 0; } static int check_cgroupfs_options(struct fs_context *fc) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); u16 mask = U16_MAX; u16 enabled = 0; struct cgroup_subsys *ss; int i; #ifdef CONFIG_CPUSETS mask = ~((u16)1 << cpuset_cgrp_id); #endif for_each_subsys(ss, i) if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i)) enabled |= 1 << i; ctx->subsys_mask &= enabled; /* * In absence of 'none', 'name=' and subsystem name options, * let's default to 'all'. */ if (!ctx->subsys_mask && !ctx->none && !ctx->name) ctx->all_ss = true; if (ctx->all_ss) { /* Mutually exclusive option 'all' + subsystem name */ if (ctx->subsys_mask) return invalfc(fc, "subsys name conflicts with all"); /* 'all' => select all the subsystems */ ctx->subsys_mask = enabled; } /* * We either have to specify by name or by subsystems. (So all * empty hierarchies must have a name). */ if (!ctx->subsys_mask && !ctx->name) return invalfc(fc, "Need name or subsystem set"); /* * Option noprefix was introduced just for backward compatibility * with the old cpuset, so we allow noprefix only if mounting just * the cpuset subsystem. */ if ((ctx->flags & CGRP_ROOT_NOPREFIX) && (ctx->subsys_mask & mask)) return invalfc(fc, "noprefix used incorrectly"); /* Can't specify "none" and some subsystems */ if (ctx->subsys_mask && ctx->none) return invalfc(fc, "none used incorrectly"); return 0; } int cgroup1_reconfigure(struct fs_context *fc) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); struct kernfs_root *kf_root = kernfs_root_from_sb(fc->root->d_sb); struct cgroup_root *root = cgroup_root_from_kf(kf_root); int ret = 0; u16 added_mask, removed_mask; cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); /* See what subsystems are wanted */ ret = check_cgroupfs_options(fc); if (ret) goto out_unlock; if (ctx->subsys_mask != root->subsys_mask || ctx->release_agent) pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n", task_tgid_nr(current), current->comm); added_mask = ctx->subsys_mask & ~root->subsys_mask; removed_mask = root->subsys_mask & ~ctx->subsys_mask; /* Don't allow flags or name to change at remount */ if ((ctx->flags ^ root->flags) || (ctx->name && strcmp(ctx->name, root->name))) { errorfc(fc, "option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"", ctx->flags, ctx->name ?: "", root->flags, root->name); ret = -EINVAL; goto out_unlock; } /* remounting is not allowed for populated hierarchies */ if (!list_empty(&root->cgrp.self.children)) { ret = -EBUSY; goto out_unlock; } ret = rebind_subsystems(root, added_mask); if (ret) goto out_unlock; WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask)); if (ctx->release_agent) { spin_lock(&release_agent_path_lock); strcpy(root->release_agent_path, ctx->release_agent); spin_unlock(&release_agent_path_lock); } trace_cgroup_remount(root); out_unlock: cgroup_unlock(); return ret; } struct kernfs_syscall_ops cgroup1_kf_syscall_ops = { .rename = cgroup1_rename, .show_options = cgroup1_show_options, .mkdir = cgroup_mkdir, .rmdir = cgroup_rmdir, .show_path = cgroup_show_path, }; /* * The guts of cgroup1 mount - find or create cgroup_root to use. * Called with cgroup_mutex held; returns 0 on success, -E... on * error and positive - in case when the candidate is busy dying. * On success it stashes a reference to cgroup_root into given * cgroup_fs_context; that reference is *NOT* counting towards the * cgroup_root refcount. */ static int cgroup1_root_to_use(struct fs_context *fc) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); struct cgroup_root *root; struct cgroup_subsys *ss; int i, ret; /* First find the desired set of subsystems */ ret = check_cgroupfs_options(fc); if (ret) return ret; /* * Destruction of cgroup root is asynchronous, so subsystems may * still be dying after the previous unmount. Let's drain the * dying subsystems. We just need to ensure that the ones * unmounted previously finish dying and don't care about new ones * starting. Testing ref liveliness is good enough. */ for_each_subsys(ss, i) { if (!(ctx->subsys_mask & (1 << i)) || ss->root == &cgrp_dfl_root) continue; if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) return 1; /* restart */ cgroup_put(&ss->root->cgrp); } for_each_root(root) { bool name_match = false; if (root == &cgrp_dfl_root) continue; /* * If we asked for a name then it must match. Also, if * name matches but sybsys_mask doesn't, we should fail. * Remember whether name matched. */ if (ctx->name) { if (strcmp(ctx->name, root->name)) continue; name_match = true; } /* * If we asked for subsystems (or explicitly for no * subsystems) then they must match. */ if ((ctx->subsys_mask || ctx->none) && (ctx->subsys_mask != root->subsys_mask)) { if (!name_match) continue; return -EBUSY; } if (root->flags ^ ctx->flags) pr_warn("new mount options do not match the existing superblock, will be ignored\n"); ctx->root = root; return 0; } /* * No such thing, create a new one. name= matching without subsys * specification is allowed for already existing hierarchies but we * can't create new one without subsys specification. */ if (!ctx->subsys_mask && !ctx->none) return invalfc(fc, "No subsys list or none specified"); /* Hierarchies may only be created in the initial cgroup namespace. */ if (ctx->ns != &init_cgroup_ns) return -EPERM; root = kzalloc(sizeof(*root), GFP_KERNEL); if (!root) return -ENOMEM; ctx->root = root; init_cgroup_root(ctx); ret = cgroup_setup_root(root, ctx->subsys_mask); if (!ret) cgroup_favor_dynmods(root, ctx->flags & CGRP_ROOT_FAVOR_DYNMODS); else cgroup_free_root(root); return ret; } int cgroup1_get_tree(struct fs_context *fc) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); int ret; /* Check if the caller has permission to mount. */ if (!ns_capable(ctx->ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); ret = cgroup1_root_to_use(fc); if (!ret && !percpu_ref_tryget_live(&ctx->root->cgrp.self.refcnt)) ret = 1; /* restart */ cgroup_unlock(); if (!ret) ret = cgroup_do_get_tree(fc); if (!ret && percpu_ref_is_dying(&ctx->root->cgrp.self.refcnt)) { fc_drop_locked(fc); ret = 1; } if (unlikely(ret > 0)) { msleep(10); return restart_syscall(); } return ret; } static int __init cgroup1_wq_init(void) { /* * Used to destroy pidlists and separate to serve as flush domain. * Cap @max_active to 1 too. */ cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy", 0, 1); BUG_ON(!cgroup_pidlist_destroy_wq); return 0; } core_initcall(cgroup1_wq_init); static int __init cgroup_no_v1(char *str) { struct cgroup_subsys *ss; char *token; int i; while ((token = strsep(&str, ",")) != NULL) { if (!*token) continue; if (!strcmp(token, "all")) { cgroup_no_v1_mask = U16_MAX; continue; } if (!strcmp(token, "named")) { cgroup_no_v1_named = true; continue; } for_each_subsys(ss, i) { if (strcmp(token, ss->name) && strcmp(token, ss->legacy_name)) continue; cgroup_no_v1_mask |= 1 << i; } } return 1; } __setup("cgroup_no_v1=", cgroup_no_v1); |
18 5 5 5 5 4 4 4 3 2 4 4 4 4 4 2 3 5 5 1 5 5 5 5 5 1 5 5 5 5 5 5 5 5 5 5 13 13 13 1 1 11 1 1 1 4 3 1 4 3 15 15 1 1 6 5 1 5 5 5 1 5 10 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 | // SPDX-License-Identifier: LGPL-2.1 /* * Copyright (c) 2008,2009 NEC Software Tohoku, Ltd. * Written by Takashi Sato <t-sato@yk.jp.nec.com> * Akira Fujita <a-fujita@rs.jp.nec.com> */ #include <linux/fs.h> #include <linux/quotaops.h> #include <linux/slab.h> #include <linux/sched/mm.h> #include "ext4_jbd2.h" #include "ext4.h" #include "ext4_extents.h" /** * get_ext_path() - Find an extent path for designated logical block number. * @inode: inode to be searched * @lblock: logical block number to find an extent path * @ppath: pointer to an extent path pointer (for output) * * ext4_find_extent wrapper. Return 0 on success, or a negative error value * on failure. */ static inline int get_ext_path(struct inode *inode, ext4_lblk_t lblock, struct ext4_ext_path **ppath) { struct ext4_ext_path *path; path = ext4_find_extent(inode, lblock, ppath, EXT4_EX_NOCACHE); if (IS_ERR(path)) return PTR_ERR(path); if (path[ext_depth(inode)].p_ext == NULL) { ext4_free_ext_path(path); *ppath = NULL; return -ENODATA; } return 0; } /** * ext4_double_down_write_data_sem() - write lock two inodes's i_data_sem * @first: inode to be locked * @second: inode to be locked * * Acquire write lock of i_data_sem of the two inodes */ void ext4_double_down_write_data_sem(struct inode *first, struct inode *second) { if (first < second) { down_write(&EXT4_I(first)->i_data_sem); down_write_nested(&EXT4_I(second)->i_data_sem, I_DATA_SEM_OTHER); } else { down_write(&EXT4_I(second)->i_data_sem); down_write_nested(&EXT4_I(first)->i_data_sem, I_DATA_SEM_OTHER); } } /** * ext4_double_up_write_data_sem - Release two inodes' write lock of i_data_sem * * @orig_inode: original inode structure to be released its lock first * @donor_inode: donor inode structure to be released its lock second * Release write lock of i_data_sem of two inodes (orig and donor). */ void ext4_double_up_write_data_sem(struct inode *orig_inode, struct inode *donor_inode) { up_write(&EXT4_I(orig_inode)->i_data_sem); up_write(&EXT4_I(donor_inode)->i_data_sem); } /** * mext_check_coverage - Check that all extents in range has the same type * * @inode: inode in question * @from: block offset of inode * @count: block count to be checked * @unwritten: extents expected to be unwritten * @err: pointer to save error value * * Return 1 if all extents in range has expected type, and zero otherwise. */ static int mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count, int unwritten, int *err) { struct ext4_ext_path *path = NULL; struct ext4_extent *ext; int ret = 0; ext4_lblk_t last = from + count; while (from < last) { *err = get_ext_path(inode, from, &path); if (*err) goto out; ext = path[ext_depth(inode)].p_ext; if (unwritten != ext4_ext_is_unwritten(ext)) goto out; from += ext4_ext_get_actual_len(ext); } ret = 1; out: ext4_free_ext_path(path); return ret; } /** * mext_page_double_lock - Grab and lock pages on both @inode1 and @inode2 * * @inode1: the inode structure * @inode2: the inode structure * @index1: page index * @index2: page index * @page: result page vector * * Grab two locked pages for inode's by inode order */ static int mext_page_double_lock(struct inode *inode1, struct inode *inode2, pgoff_t index1, pgoff_t index2, struct page *page[2]) { struct address_space *mapping[2]; unsigned int flags; BUG_ON(!inode1 || !inode2); if (inode1 < inode2) { mapping[0] = inode1->i_mapping; mapping[1] = inode2->i_mapping; } else { swap(index1, index2); mapping[0] = inode2->i_mapping; mapping[1] = inode1->i_mapping; } flags = memalloc_nofs_save(); page[0] = grab_cache_page_write_begin(mapping[0], index1); if (!page[0]) { memalloc_nofs_restore(flags); return -ENOMEM; } page[1] = grab_cache_page_write_begin(mapping[1], index2); memalloc_nofs_restore(flags); if (!page[1]) { unlock_page(page[0]); put_page(page[0]); return -ENOMEM; } /* * grab_cache_page_write_begin() may not wait on page's writeback if * BDI not demand that. But it is reasonable to be very conservative * here and explicitly wait on page's writeback */ wait_on_page_writeback(page[0]); wait_on_page_writeback(page[1]); if (inode1 > inode2) swap(page[0], page[1]); return 0; } /* Force page buffers uptodate w/o dropping page's lock */ static int mext_page_mkuptodate(struct page *page, unsigned from, unsigned to) { struct inode *inode = page->mapping->host; sector_t block; struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; unsigned int blocksize, block_start, block_end; int i, err, nr = 0, partial = 0; BUG_ON(!PageLocked(page)); BUG_ON(PageWriteback(page)); if (PageUptodate(page)) return 0; blocksize = i_blocksize(inode); if (!page_has_buffers(page)) create_empty_buffers(page, blocksize, 0); head = page_buffers(page); block = (sector_t)page->index << (PAGE_SHIFT - inode->i_blkbits); for (bh = head, block_start = 0; bh != head || !block_start; block++, block_start = block_end, bh = bh->b_this_page) { block_end = block_start + blocksize; if (block_end <= from || block_start >= to) { if (!buffer_uptodate(bh)) partial = 1; continue; } if (buffer_uptodate(bh)) continue; if (!buffer_mapped(bh)) { err = ext4_get_block(inode, block, bh, 0); if (err) { SetPageError(page); return err; } if (!buffer_mapped(bh)) { zero_user(page, block_start, blocksize); set_buffer_uptodate(bh); continue; } } BUG_ON(nr >= MAX_BUF_PER_PAGE); arr[nr++] = bh; } /* No io required */ if (!nr) goto out; for (i = 0; i < nr; i++) { bh = arr[i]; if (!bh_uptodate_or_lock(bh)) { err = ext4_read_bh(bh, 0, NULL); if (err) return err; } } out: if (!partial) SetPageUptodate(page); return 0; } /** * move_extent_per_page - Move extent data per page * * @o_filp: file structure of original file * @donor_inode: donor inode * @orig_page_offset: page index on original file * @donor_page_offset: page index on donor file * @data_offset_in_page: block index where data swapping starts * @block_len_in_page: the number of blocks to be swapped * @unwritten: orig extent is unwritten or not * @err: pointer to save return value * * Save the data in original inode blocks and replace original inode extents * with donor inode extents by calling ext4_swap_extents(). * Finally, write out the saved data in new original inode blocks. Return * replaced block count. */ static int move_extent_per_page(struct file *o_filp, struct inode *donor_inode, pgoff_t orig_page_offset, pgoff_t donor_page_offset, int data_offset_in_page, int block_len_in_page, int unwritten, int *err) { struct inode *orig_inode = file_inode(o_filp); struct page *pagep[2] = {NULL, NULL}; struct folio *folio[2] = {NULL, NULL}; handle_t *handle; ext4_lblk_t orig_blk_offset, donor_blk_offset; unsigned long blocksize = orig_inode->i_sb->s_blocksize; unsigned int tmp_data_size, data_size, replaced_size; int i, err2, jblocks, retries = 0; int replaced_count = 0; int from = data_offset_in_page << orig_inode->i_blkbits; int blocks_per_page = PAGE_SIZE >> orig_inode->i_blkbits; struct super_block *sb = orig_inode->i_sb; struct buffer_head *bh = NULL; /* * It needs twice the amount of ordinary journal buffers because * inode and donor_inode may change each different metadata blocks. */ again: *err = 0; jblocks = ext4_writepage_trans_blocks(orig_inode) * 2; handle = ext4_journal_start(orig_inode, EXT4_HT_MOVE_EXTENTS, jblocks); if (IS_ERR(handle)) { *err = PTR_ERR(handle); return 0; } orig_blk_offset = orig_page_offset * blocks_per_page + data_offset_in_page; donor_blk_offset = donor_page_offset * blocks_per_page + data_offset_in_page; /* Calculate data_size */ if ((orig_blk_offset + block_len_in_page - 1) == ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) { /* Replace the last block */ tmp_data_size = orig_inode->i_size & (blocksize - 1); /* * If data_size equal zero, it shows data_size is multiples of * blocksize. So we set appropriate value. */ if (tmp_data_size == 0) tmp_data_size = blocksize; data_size = tmp_data_size + ((block_len_in_page - 1) << orig_inode->i_blkbits); } else data_size = block_len_in_page << orig_inode->i_blkbits; replaced_size = data_size; *err = mext_page_double_lock(orig_inode, donor_inode, orig_page_offset, donor_page_offset, pagep); if (unlikely(*err < 0)) goto stop_journal; /* * If orig extent was unwritten it can become initialized * at any time after i_data_sem was dropped, in order to * serialize with delalloc we have recheck extent while we * hold page's lock, if it is still the case data copy is not * necessary, just swap data blocks between orig and donor. */ folio[0] = page_folio(pagep[0]); folio[1] = page_folio(pagep[1]); VM_BUG_ON_FOLIO(folio_test_large(folio[0]), folio[0]); VM_BUG_ON_FOLIO(folio_test_large(folio[1]), folio[1]); VM_BUG_ON_FOLIO(folio_nr_pages(folio[0]) != folio_nr_pages(folio[1]), folio[1]); if (unwritten) { ext4_double_down_write_data_sem(orig_inode, donor_inode); /* If any of extents in range became initialized we have to * fallback to data copying */ unwritten = mext_check_coverage(orig_inode, orig_blk_offset, block_len_in_page, 1, err); if (*err) goto drop_data_sem; unwritten &= mext_check_coverage(donor_inode, donor_blk_offset, block_len_in_page, 1, err); if (*err) goto drop_data_sem; if (!unwritten) { ext4_double_up_write_data_sem(orig_inode, donor_inode); goto data_copy; } if (!filemap_release_folio(folio[0], 0) || !filemap_release_folio(folio[1], 0)) { *err = -EBUSY; goto drop_data_sem; } replaced_count = ext4_swap_extents(handle, orig_inode, donor_inode, orig_blk_offset, donor_blk_offset, block_len_in_page, 1, err); drop_data_sem: ext4_double_up_write_data_sem(orig_inode, donor_inode); goto unlock_folios; } data_copy: *err = mext_page_mkuptodate(&folio[0]->page, from, from + replaced_size); if (*err) goto unlock_folios; /* At this point all buffers in range are uptodate, old mapping layout * is no longer required, try to drop it now. */ if (!filemap_release_folio(folio[0], 0) || !filemap_release_folio(folio[1], 0)) { *err = -EBUSY; goto unlock_folios; } ext4_double_down_write_data_sem(orig_inode, donor_inode); replaced_count = ext4_swap_extents(handle, orig_inode, donor_inode, orig_blk_offset, donor_blk_offset, block_len_in_page, 1, err); ext4_double_up_write_data_sem(orig_inode, donor_inode); if (*err) { if (replaced_count) { block_len_in_page = replaced_count; replaced_size = block_len_in_page << orig_inode->i_blkbits; } else goto unlock_folios; } /* Perform all necessary steps similar write_begin()/write_end() * but keeping in mind that i_size will not change */ if (!folio_buffers(folio[0])) create_empty_buffers(&folio[0]->page, 1 << orig_inode->i_blkbits, 0); bh = folio_buffers(folio[0]); for (i = 0; i < data_offset_in_page; i++) bh = bh->b_this_page; for (i = 0; i < block_len_in_page; i++) { *err = ext4_get_block(orig_inode, orig_blk_offset + i, bh, 0); if (*err < 0) break; bh = bh->b_this_page; } if (!*err) *err = block_commit_write(&folio[0]->page, from, from + replaced_size); if (unlikely(*err < 0)) goto repair_branches; /* Even in case of data=writeback it is reasonable to pin * inode to transaction, to prevent unexpected data loss */ *err = ext4_jbd2_inode_add_write(handle, orig_inode, (loff_t)orig_page_offset << PAGE_SHIFT, replaced_size); unlock_folios: folio_unlock(folio[0]); folio_put(folio[0]); folio_unlock(folio[1]); folio_put(folio[1]); stop_journal: ext4_journal_stop(handle); if (*err == -ENOSPC && ext4_should_retry_alloc(sb, &retries)) goto again; /* Buffer was busy because probably is pinned to journal transaction, * force transaction commit may help to free it. */ if (*err == -EBUSY && retries++ < 4 && EXT4_SB(sb)->s_journal && jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal)) goto again; return replaced_count; repair_branches: /* * This should never ever happen! * Extents are swapped already, but we are not able to copy data. * Try to swap extents to it's original places */ ext4_double_down_write_data_sem(orig_inode, donor_inode); replaced_count = ext4_swap_extents(handle, donor_inode, orig_inode, orig_blk_offset, donor_blk_offset, block_len_in_page, 0, &err2); ext4_double_up_write_data_sem(orig_inode, donor_inode); if (replaced_count != block_len_in_page) { ext4_error_inode_block(orig_inode, (sector_t)(orig_blk_offset), EIO, "Unable to copy data block," " data will be lost."); *err = -EIO; } replaced_count = 0; goto unlock_folios; } /** * mext_check_arguments - Check whether move extent can be done * * @orig_inode: original inode * @donor_inode: donor inode * @orig_start: logical start offset in block for orig * @donor_start: logical start offset in block for donor * @len: the number of blocks to be moved * * Check the arguments of ext4_move_extents() whether the files can be * exchanged with each other. * Return 0 on success, or a negative error value on failure. */ static int mext_check_arguments(struct inode *orig_inode, struct inode *donor_inode, __u64 orig_start, __u64 donor_start, __u64 *len) { __u64 orig_eof, donor_eof; unsigned int blkbits = orig_inode->i_blkbits; unsigned int blocksize = 1 << blkbits; orig_eof = (i_size_read(orig_inode) + blocksize - 1) >> blkbits; donor_eof = (i_size_read(donor_inode) + blocksize - 1) >> blkbits; if (donor_inode->i_mode & (S_ISUID|S_ISGID)) { ext4_debug("ext4 move extent: suid or sgid is set" " to donor file [ino:orig %lu, donor %lu]\n", orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } if (IS_IMMUTABLE(donor_inode) || IS_APPEND(donor_inode)) return -EPERM; /* Ext4 move extent does not support swap files */ if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) { ext4_debug("ext4 move extent: The argument files should not be swap files [ino:orig %lu, donor %lu]\n", orig_inode->i_ino, donor_inode->i_ino); return -ETXTBSY; } if (ext4_is_quota_file(orig_inode) && ext4_is_quota_file(donor_inode)) { ext4_debug("ext4 move extent: The argument files should not be quota files [ino:orig %lu, donor %lu]\n", orig_inode->i_ino, donor_inode->i_ino); return -EOPNOTSUPP; } /* Ext4 move extent supports only extent based file */ if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) { ext4_debug("ext4 move extent: orig file is not extents " "based file [ino:orig %lu]\n", orig_inode->i_ino); return -EOPNOTSUPP; } else if (!(ext4_test_inode_flag(donor_inode, EXT4_INODE_EXTENTS))) { ext4_debug("ext4 move extent: donor file is not extents " "based file [ino:donor %lu]\n", donor_inode->i_ino); return -EOPNOTSUPP; } if ((!orig_inode->i_size) || (!donor_inode->i_size)) { ext4_debug("ext4 move extent: File size is 0 byte\n"); return -EINVAL; } /* Start offset should be same */ if ((orig_start & ~(PAGE_MASK >> orig_inode->i_blkbits)) != (donor_start & ~(PAGE_MASK >> orig_inode->i_blkbits))) { ext4_debug("ext4 move extent: orig and donor's start " "offsets are not aligned [ino:orig %lu, donor %lu]\n", orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } if ((orig_start >= EXT_MAX_BLOCKS) || (donor_start >= EXT_MAX_BLOCKS) || (*len > EXT_MAX_BLOCKS) || (donor_start + *len >= EXT_MAX_BLOCKS) || (orig_start + *len >= EXT_MAX_BLOCKS)) { ext4_debug("ext4 move extent: Can't handle over [%u] blocks " "[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCKS, orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } if (orig_eof <= orig_start) *len = 0; else if (orig_eof < orig_start + *len - 1) *len = orig_eof - orig_start; if (donor_eof <= donor_start) *len = 0; else if (donor_eof < donor_start + *len - 1) *len = donor_eof - donor_start; if (!*len) { ext4_debug("ext4 move extent: len should not be 0 " "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } return 0; } /** * ext4_move_extents - Exchange the specified range of a file * * @o_filp: file structure of the original file * @d_filp: file structure of the donor file * @orig_blk: start offset in block for orig * @donor_blk: start offset in block for donor * @len: the number of blocks to be moved * @moved_len: moved block length * * This function returns 0 and moved block length is set in moved_len * if succeed, otherwise returns error value. * */ int ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, __u64 donor_blk, __u64 len, __u64 *moved_len) { struct inode *orig_inode = file_inode(o_filp); struct inode *donor_inode = file_inode(d_filp); struct ext4_ext_path *path = NULL; int blocks_per_page = PAGE_SIZE >> orig_inode->i_blkbits; ext4_lblk_t o_end, o_start = orig_blk; ext4_lblk_t d_start = donor_blk; int ret; if (orig_inode->i_sb != donor_inode->i_sb) { ext4_debug("ext4 move extent: The argument files " "should be in same FS [ino:orig %lu, donor %lu]\n", orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } /* orig and donor should be different inodes */ if (orig_inode == donor_inode) { ext4_debug("ext4 move extent: The argument files should not " "be same inode [ino:orig %lu, donor %lu]\n", orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } /* Regular file check */ if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) { ext4_debug("ext4 move extent: The argument files should be " "regular file [ino:orig %lu, donor %lu]\n", orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } /* TODO: it's not obvious how to swap blocks for inodes with full journaling enabled */ if (ext4_should_journal_data(orig_inode) || ext4_should_journal_data(donor_inode)) { ext4_msg(orig_inode->i_sb, KERN_ERR, "Online defrag not supported with data journaling"); return -EOPNOTSUPP; } if (IS_ENCRYPTED(orig_inode) || IS_ENCRYPTED(donor_inode)) { ext4_msg(orig_inode->i_sb, KERN_ERR, "Online defrag not supported for encrypted files"); return -EOPNOTSUPP; } /* Protect orig and donor inodes against a truncate */ lock_two_nondirectories(orig_inode, donor_inode); /* Wait for all existing dio workers */ inode_dio_wait(orig_inode); inode_dio_wait(donor_inode); /* Protect extent tree against block allocations via delalloc */ ext4_double_down_write_data_sem(orig_inode, donor_inode); /* Check the filesystem environment whether move_extent can be done */ ret = mext_check_arguments(orig_inode, donor_inode, orig_blk, donor_blk, &len); if (ret) goto out; o_end = o_start + len; *moved_len = 0; while (o_start < o_end) { struct ext4_extent *ex; ext4_lblk_t cur_blk, next_blk; pgoff_t orig_page_index, donor_page_index; int offset_in_page; int unwritten, cur_len; ret = get_ext_path(orig_inode, o_start, &path); if (ret) goto out; ex = path[path->p_depth].p_ext; cur_blk = le32_to_cpu(ex->ee_block); cur_len = ext4_ext_get_actual_len(ex); /* Check hole before the start pos */ if (cur_blk + cur_len - 1 < o_start) { next_blk = ext4_ext_next_allocated_block(path); if (next_blk == EXT_MAX_BLOCKS) { ret = -ENODATA; goto out; } d_start += next_blk - o_start; o_start = next_blk; continue; /* Check hole after the start pos */ } else if (cur_blk > o_start) { /* Skip hole */ d_start += cur_blk - o_start; o_start = cur_blk; /* Extent inside requested range ?*/ if (cur_blk >= o_end) goto out; } else { /* in_range(o_start, o_blk, o_len) */ cur_len += cur_blk - o_start; } unwritten = ext4_ext_is_unwritten(ex); if (o_end - o_start < cur_len) cur_len = o_end - o_start; orig_page_index = o_start >> (PAGE_SHIFT - orig_inode->i_blkbits); donor_page_index = d_start >> (PAGE_SHIFT - donor_inode->i_blkbits); offset_in_page = o_start % blocks_per_page; if (cur_len > blocks_per_page - offset_in_page) cur_len = blocks_per_page - offset_in_page; /* * Up semaphore to avoid following problems: * a. transaction deadlock among ext4_journal_start, * ->write_begin via pagefault, and jbd2_journal_commit * b. racing with ->read_folio, ->write_begin, and * ext4_get_block in move_extent_per_page */ ext4_double_up_write_data_sem(orig_inode, donor_inode); /* Swap original branches with new branches */ *moved_len += move_extent_per_page(o_filp, donor_inode, orig_page_index, donor_page_index, offset_in_page, cur_len, unwritten, &ret); ext4_double_down_write_data_sem(orig_inode, donor_inode); if (ret < 0) break; o_start += cur_len; d_start += cur_len; } out: if (*moved_len) { ext4_discard_preallocations(orig_inode, 0); ext4_discard_preallocations(donor_inode, 0); } ext4_free_ext_path(path); ext4_double_up_write_data_sem(orig_inode, donor_inode); unlock_two_nondirectories(orig_inode, donor_inode); return ret; } |
4 403 505 1 32 548 40 2 1 1 2 545 186 528 461 144 9 461 145 503 30 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef ASM_KVM_CACHE_REGS_H #define ASM_KVM_CACHE_REGS_H #include <linux/kvm_host.h> #define KVM_POSSIBLE_CR0_GUEST_BITS (X86_CR0_TS | X86_CR0_WP) #define KVM_POSSIBLE_CR4_GUEST_BITS \ (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ | X86_CR4_OSXMMEXCPT | X86_CR4_PGE | X86_CR4_TSD | X86_CR4_FSGSBASE) #define X86_CR0_PDPTR_BITS (X86_CR0_CD | X86_CR0_NW | X86_CR0_PG) #define X86_CR4_TLBFLUSH_BITS (X86_CR4_PGE | X86_CR4_PCIDE | X86_CR4_PAE | X86_CR4_SMEP) #define X86_CR4_PDPTR_BITS (X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_SMEP) static_assert(!(KVM_POSSIBLE_CR0_GUEST_BITS & X86_CR0_PDPTR_BITS)); #define BUILD_KVM_GPR_ACCESSORS(lname, uname) \ static __always_inline unsigned long kvm_##lname##_read(struct kvm_vcpu *vcpu)\ { \ return vcpu->arch.regs[VCPU_REGS_##uname]; \ } \ static __always_inline void kvm_##lname##_write(struct kvm_vcpu *vcpu, \ unsigned long val) \ { \ vcpu->arch.regs[VCPU_REGS_##uname] = val; \ } BUILD_KVM_GPR_ACCESSORS(rax, RAX) BUILD_KVM_GPR_ACCESSORS(rbx, RBX) BUILD_KVM_GPR_ACCESSORS(rcx, RCX) BUILD_KVM_GPR_ACCESSORS(rdx, RDX) BUILD_KVM_GPR_ACCESSORS(rbp, RBP) BUILD_KVM_GPR_ACCESSORS(rsi, RSI) BUILD_KVM_GPR_ACCESSORS(rdi, RDI) #ifdef CONFIG_X86_64 BUILD_KVM_GPR_ACCESSORS(r8, R8) BUILD_KVM_GPR_ACCESSORS(r9, R9) BUILD_KVM_GPR_ACCESSORS(r10, R10) BUILD_KVM_GPR_ACCESSORS(r11, R11) BUILD_KVM_GPR_ACCESSORS(r12, R12) BUILD_KVM_GPR_ACCESSORS(r13, R13) BUILD_KVM_GPR_ACCESSORS(r14, R14) BUILD_KVM_GPR_ACCESSORS(r15, R15) #endif /* * avail dirty * 0 0 register in VMCS/VMCB * 0 1 *INVALID* * 1 0 register in vcpu->arch * 1 1 register in vcpu->arch, needs to be stored back */ static inline bool kvm_register_is_available(struct kvm_vcpu *vcpu, enum kvm_reg reg) { return test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); } static inline bool kvm_register_is_dirty(struct kvm_vcpu *vcpu, enum kvm_reg reg) { return test_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty); } static inline void kvm_register_mark_available(struct kvm_vcpu *vcpu, enum kvm_reg reg) { __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); } static inline void kvm_register_mark_dirty(struct kvm_vcpu *vcpu, enum kvm_reg reg) { __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); __set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty); } /* * The "raw" register helpers are only for cases where the full 64 bits of a * register are read/written irrespective of current vCPU mode. In other words, * odds are good you shouldn't be using the raw variants. */ static inline unsigned long kvm_register_read_raw(struct kvm_vcpu *vcpu, int reg) { if (WARN_ON_ONCE((unsigned int)reg >= NR_VCPU_REGS)) return 0; if (!kvm_register_is_available(vcpu, reg)) static_call(kvm_x86_cache_reg)(vcpu, reg); return vcpu->arch.regs[reg]; } static inline void kvm_register_write_raw(struct kvm_vcpu *vcpu, int reg, unsigned long val) { if (WARN_ON_ONCE((unsigned int)reg >= NR_VCPU_REGS)) return; vcpu->arch.regs[reg] = val; kvm_register_mark_dirty(vcpu, reg); } static inline unsigned long kvm_rip_read(struct kvm_vcpu *vcpu) { return kvm_register_read_raw(vcpu, VCPU_REGS_RIP); } static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val) { kvm_register_write_raw(vcpu, VCPU_REGS_RIP, val); } static inline unsigned long kvm_rsp_read(struct kvm_vcpu *vcpu) { return kvm_register_read_raw(vcpu, VCPU_REGS_RSP); } static inline void kvm_rsp_write(struct kvm_vcpu *vcpu, unsigned long val) { kvm_register_write_raw(vcpu, VCPU_REGS_RSP, val); } static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) { might_sleep(); /* on svm */ if (!kvm_register_is_available(vcpu, VCPU_EXREG_PDPTR)) static_call(kvm_x86_cache_reg)(vcpu, VCPU_EXREG_PDPTR); return vcpu->arch.walk_mmu->pdptrs[index]; } static inline void kvm_pdptr_write(struct kvm_vcpu *vcpu, int index, u64 value) { vcpu->arch.walk_mmu->pdptrs[index] = value; } static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask) { ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS; if ((tmask & vcpu->arch.cr0_guest_owned_bits) && !kvm_register_is_available(vcpu, VCPU_EXREG_CR0)) static_call(kvm_x86_cache_reg)(vcpu, VCPU_EXREG_CR0); return vcpu->arch.cr0 & mask; } static inline ulong kvm_read_cr0(struct kvm_vcpu *vcpu) { return kvm_read_cr0_bits(vcpu, ~0UL); } static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask) { ulong tmask = mask & KVM_POSSIBLE_CR4_GUEST_BITS; if ((tmask & vcpu->arch.cr4_guest_owned_bits) && !kvm_register_is_available(vcpu, VCPU_EXREG_CR4)) static_call(kvm_x86_cache_reg)(vcpu, VCPU_EXREG_CR4); return vcpu->arch.cr4 & mask; } static inline ulong kvm_read_cr3(struct kvm_vcpu *vcpu) { if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3)) static_call(kvm_x86_cache_reg)(vcpu, VCPU_EXREG_CR3); return vcpu->arch.cr3; } static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu) { return kvm_read_cr4_bits(vcpu, ~0UL); } static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu) { return (kvm_rax_read(vcpu) & -1u) | ((u64)(kvm_rdx_read(vcpu) & -1u) << 32); } static inline void enter_guest_mode(struct kvm_vcpu *vcpu) { vcpu->arch.hflags |= HF_GUEST_MASK; vcpu->stat.guest_mode = 1; } static inline void leave_guest_mode(struct kvm_vcpu *vcpu) { vcpu->arch.hflags &= ~HF_GUEST_MASK; if (vcpu->arch.load_eoi_exitmap_pending) { vcpu->arch.load_eoi_exitmap_pending = false; kvm_make_request(KVM_REQ_LOAD_EOI_EXITMAP, vcpu); } vcpu->stat.guest_mode = 0; } static inline bool is_guest_mode(struct kvm_vcpu *vcpu) { return vcpu->arch.hflags & HF_GUEST_MASK; } static inline bool is_smm(struct kvm_vcpu *vcpu) { return vcpu->arch.hflags & HF_SMM_MASK; } #endif |
20 19 19 20 20 18 2 18 7 12 18 18 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 | // SPDX-License-Identifier: GPL-2.0-only /* * LED Class Core * * Copyright (C) 2005 John Lenz <lenz@cs.wisc.edu> * Copyright (C) 2005-2007 Richard Purdie <rpurdie@openedhand.com> */ #include <linux/ctype.h> #include <linux/device.h> #include <linux/err.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/leds.h> #include <linux/list.h> #include <linux/module.h> #include <linux/property.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/timer.h> #include <uapi/linux/uleds.h> #include <linux/of.h> #include "leds.h" static struct class *leds_class; static ssize_t brightness_show(struct device *dev, struct device_attribute *attr, char *buf) { struct led_classdev *led_cdev = dev_get_drvdata(dev); unsigned int brightness; mutex_lock(&led_cdev->led_access); led_update_brightness(led_cdev); brightness = led_cdev->brightness; mutex_unlock(&led_cdev->led_access); return sprintf(buf, "%u\n", brightness); } static ssize_t brightness_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t size) { struct led_classdev *led_cdev = dev_get_drvdata(dev); unsigned long state; ssize_t ret; mutex_lock(&led_cdev->led_access); if (led_sysfs_is_disabled(led_cdev)) { ret = -EBUSY; goto unlock; } ret = kstrtoul(buf, 10, &state); if (ret) goto unlock; if (state == LED_OFF) led_trigger_remove(led_cdev); led_set_brightness(led_cdev, state); flush_work(&led_cdev->set_brightness_work); ret = size; unlock: mutex_unlock(&led_cdev->led_access); return ret; } static DEVICE_ATTR_RW(brightness); static ssize_t max_brightness_show(struct device *dev, struct device_attribute *attr, char *buf) { struct led_classdev *led_cdev = dev_get_drvdata(dev); unsigned int max_brightness; mutex_lock(&led_cdev->led_access); max_brightness = led_cdev->max_brightness; mutex_unlock(&led_cdev->led_access); return sprintf(buf, "%u\n", max_brightness); } static DEVICE_ATTR_RO(max_brightness); #ifdef CONFIG_LEDS_TRIGGERS static BIN_ATTR(trigger, 0644, led_trigger_read, led_trigger_write, 0); static struct bin_attribute *led_trigger_bin_attrs[] = { &bin_attr_trigger, NULL, }; static const struct attribute_group led_trigger_group = { .bin_attrs = led_trigger_bin_attrs, }; #endif static struct attribute *led_class_attrs[] = { &dev_attr_brightness.attr, &dev_attr_max_brightness.attr, NULL, }; static const struct attribute_group led_group = { .attrs = led_class_attrs, }; static const struct attribute_group *led_groups[] = { &led_group, #ifdef CONFIG_LEDS_TRIGGERS &led_trigger_group, #endif NULL, }; #ifdef CONFIG_LEDS_BRIGHTNESS_HW_CHANGED static ssize_t brightness_hw_changed_show(struct device *dev, struct device_attribute *attr, char *buf) { struct led_classdev *led_cdev = dev_get_drvdata(dev); if (led_cdev->brightness_hw_changed == -1) return -ENODATA; return sprintf(buf, "%u\n", led_cdev->brightness_hw_changed); } static DEVICE_ATTR_RO(brightness_hw_changed); static int led_add_brightness_hw_changed(struct led_classdev *led_cdev) { struct device *dev = led_cdev->dev; int ret; ret = device_create_file(dev, &dev_attr_brightness_hw_changed); if (ret) { dev_err(dev, "Error creating brightness_hw_changed\n"); return ret; } led_cdev->brightness_hw_changed_kn = sysfs_get_dirent(dev->kobj.sd, "brightness_hw_changed"); if (!led_cdev->brightness_hw_changed_kn) { dev_err(dev, "Error getting brightness_hw_changed kn\n"); device_remove_file(dev, &dev_attr_brightness_hw_changed); return -ENXIO; } return 0; } static void led_remove_brightness_hw_changed(struct led_classdev *led_cdev) { sysfs_put(led_cdev->brightness_hw_changed_kn); device_remove_file(led_cdev->dev, &dev_attr_brightness_hw_changed); } void led_classdev_notify_brightness_hw_changed(struct led_classdev *led_cdev, unsigned int brightness) { if (WARN_ON(!led_cdev->brightness_hw_changed_kn)) return; led_cdev->brightness_hw_changed = brightness; sysfs_notify_dirent(led_cdev->brightness_hw_changed_kn); } EXPORT_SYMBOL_GPL(led_classdev_notify_brightness_hw_changed); #else static int led_add_brightness_hw_changed(struct led_classdev *led_cdev) { return 0; } static void led_remove_brightness_hw_changed(struct led_classdev *led_cdev) { } #endif /** * led_classdev_suspend - suspend an led_classdev. * @led_cdev: the led_classdev to suspend. */ void led_classdev_suspend(struct led_classdev *led_cdev) { led_cdev->flags |= LED_SUSPENDED; led_set_brightness_nopm(led_cdev, 0); flush_work(&led_cdev->set_brightness_work); } EXPORT_SYMBOL_GPL(led_classdev_suspend); /** * led_classdev_resume - resume an led_classdev. * @led_cdev: the led_classdev to resume. */ void led_classdev_resume(struct led_classdev *led_cdev) { led_set_brightness_nopm(led_cdev, led_cdev->brightness); if (led_cdev->flash_resume) led_cdev->flash_resume(led_cdev); led_cdev->flags &= ~LED_SUSPENDED; } EXPORT_SYMBOL_GPL(led_classdev_resume); #ifdef CONFIG_PM_SLEEP static int led_suspend(struct device *dev) { struct led_classdev *led_cdev = dev_get_drvdata(dev); if (led_cdev->flags & LED_CORE_SUSPENDRESUME) led_classdev_suspend(led_cdev); return 0; } static int led_resume(struct device *dev) { struct led_classdev *led_cdev = dev_get_drvdata(dev); if (led_cdev->flags & LED_CORE_SUSPENDRESUME) led_classdev_resume(led_cdev); return 0; } #endif static SIMPLE_DEV_PM_OPS(leds_class_dev_pm_ops, led_suspend, led_resume); /** * of_led_get() - request a LED device via the LED framework * @np: device node to get the LED device from * @index: the index of the LED * * Returns the LED device parsed from the phandle specified in the "leds" * property of a device tree node or a negative error-code on failure. */ struct led_classdev *of_led_get(struct device_node *np, int index) { struct device *led_dev; struct led_classdev *led_cdev; struct device_node *led_node; led_node = of_parse_phandle(np, "leds", index); if (!led_node) return ERR_PTR(-ENOENT); led_dev = class_find_device_by_of_node(leds_class, led_node); of_node_put(led_node); if (!led_dev) return ERR_PTR(-EPROBE_DEFER); led_cdev = dev_get_drvdata(led_dev); if (!try_module_get(led_cdev->dev->parent->driver->owner)) { put_device(led_cdev->dev); return ERR_PTR(-ENODEV); } return led_cdev; } EXPORT_SYMBOL_GPL(of_led_get); /** * led_put() - release a LED device * @led_cdev: LED device */ void led_put(struct led_classdev *led_cdev) { module_put(led_cdev->dev->parent->driver->owner); put_device(led_cdev->dev); } EXPORT_SYMBOL_GPL(led_put); static void devm_led_release(struct device *dev, void *res) { struct led_classdev **p = res; led_put(*p); } /** * devm_of_led_get - Resource-managed request of a LED device * @dev: LED consumer * @index: index of the LED to obtain in the consumer * * The device node of the device is parse to find the request LED device. * The LED device returned from this function is automatically released * on driver detach. * * @return a pointer to a LED device or ERR_PTR(errno) on failure. */ struct led_classdev *__must_check devm_of_led_get(struct device *dev, int index) { struct led_classdev *led; struct led_classdev **dr; if (!dev) return ERR_PTR(-EINVAL); led = of_led_get(dev->of_node, index); if (IS_ERR(led)) return led; dr = devres_alloc(devm_led_release, sizeof(struct led_classdev *), GFP_KERNEL); if (!dr) { led_put(led); return ERR_PTR(-ENOMEM); } *dr = led; devres_add(dev, dr); return led; } EXPORT_SYMBOL_GPL(devm_of_led_get); static int led_classdev_next_name(const char *init_name, char *name, size_t len) { unsigned int i = 0; int ret = 0; struct device *dev; strlcpy(name, init_name, len); while ((ret < len) && (dev = class_find_device_by_name(leds_class, name))) { put_device(dev); ret = snprintf(name, len, "%s_%u", init_name, ++i); } if (ret >= len) return -ENOMEM; return i; } /** * led_classdev_register_ext - register a new object of led_classdev class * with init data. * * @parent: parent of LED device * @led_cdev: the led_classdev structure for this device. * @init_data: LED class device initialization data */ int led_classdev_register_ext(struct device *parent, struct led_classdev *led_cdev, struct led_init_data *init_data) { char composed_name[LED_MAX_NAME_SIZE]; char final_name[LED_MAX_NAME_SIZE]; const char *proposed_name = composed_name; int ret; if (init_data) { if (init_data->devname_mandatory && !init_data->devicename) { dev_err(parent, "Mandatory device name is missing"); return -EINVAL; } ret = led_compose_name(parent, init_data, composed_name); if (ret < 0) return ret; if (init_data->fwnode) { fwnode_property_read_string(init_data->fwnode, "linux,default-trigger", &led_cdev->default_trigger); if (fwnode_property_present(init_data->fwnode, "retain-state-shutdown")) led_cdev->flags |= LED_RETAIN_AT_SHUTDOWN; } } else { proposed_name = led_cdev->name; } ret = led_classdev_next_name(proposed_name, final_name, sizeof(final_name)); if (ret < 0) return ret; mutex_init(&led_cdev->led_access); mutex_lock(&led_cdev->led_access); led_cdev->dev = device_create_with_groups(leds_class, parent, 0, led_cdev, led_cdev->groups, "%s", final_name); if (IS_ERR(led_cdev->dev)) { mutex_unlock(&led_cdev->led_access); return PTR_ERR(led_cdev->dev); } if (init_data && init_data->fwnode) device_set_node(led_cdev->dev, init_data->fwnode); if (ret) dev_warn(parent, "Led %s renamed to %s due to name collision", proposed_name, dev_name(led_cdev->dev)); if (led_cdev->flags & LED_BRIGHT_HW_CHANGED) { ret = led_add_brightness_hw_changed(led_cdev); if (ret) { device_unregister(led_cdev->dev); led_cdev->dev = NULL; mutex_unlock(&led_cdev->led_access); return ret; } } led_cdev->work_flags = 0; #ifdef CONFIG_LEDS_TRIGGERS init_rwsem(&led_cdev->trigger_lock); #endif #ifdef CONFIG_LEDS_BRIGHTNESS_HW_CHANGED led_cdev->brightness_hw_changed = -1; #endif /* add to the list of leds */ down_write(&leds_list_lock); list_add_tail(&led_cdev->node, &leds_list); up_write(&leds_list_lock); if (!led_cdev->max_brightness) led_cdev->max_brightness = LED_FULL; led_update_brightness(led_cdev); led_init_core(led_cdev); #ifdef CONFIG_LEDS_TRIGGERS led_trigger_set_default(led_cdev); #endif mutex_unlock(&led_cdev->led_access); dev_dbg(parent, "Registered led device: %s\n", led_cdev->name); return 0; } EXPORT_SYMBOL_GPL(led_classdev_register_ext); /** * led_classdev_unregister - unregisters a object of led_properties class. * @led_cdev: the led device to unregister * * Unregisters a previously registered via led_classdev_register object. */ void led_classdev_unregister(struct led_classdev *led_cdev) { if (IS_ERR_OR_NULL(led_cdev->dev)) return; #ifdef CONFIG_LEDS_TRIGGERS down_write(&led_cdev->trigger_lock); if (led_cdev->trigger) led_trigger_set(led_cdev, NULL); up_write(&led_cdev->trigger_lock); #endif led_cdev->flags |= LED_UNREGISTERING; /* Stop blinking */ led_stop_software_blink(led_cdev); if (!(led_cdev->flags & LED_RETAIN_AT_SHUTDOWN)) led_set_brightness(led_cdev, LED_OFF); flush_work(&led_cdev->set_brightness_work); if (led_cdev->flags & LED_BRIGHT_HW_CHANGED) led_remove_brightness_hw_changed(led_cdev); device_unregister(led_cdev->dev); down_write(&leds_list_lock); list_del(&led_cdev->node); up_write(&leds_list_lock); mutex_destroy(&led_cdev->led_access); } EXPORT_SYMBOL_GPL(led_classdev_unregister); static void devm_led_classdev_release(struct device *dev, void *res) { led_classdev_unregister(*(struct led_classdev **)res); } /** * devm_led_classdev_register_ext - resource managed led_classdev_register_ext() * * @parent: parent of LED device * @led_cdev: the led_classdev structure for this device. * @init_data: LED class device initialization data */ int devm_led_classdev_register_ext(struct device *parent, struct led_classdev *led_cdev, struct led_init_data *init_data) { struct led_classdev **dr; int rc; dr = devres_alloc(devm_led_classdev_release, sizeof(*dr), GFP_KERNEL); if (!dr) return -ENOMEM; rc = led_classdev_register_ext(parent, led_cdev, init_data); if (rc) { devres_free(dr); return rc; } *dr = led_cdev; devres_add(parent, dr); return 0; } EXPORT_SYMBOL_GPL(devm_led_classdev_register_ext); static int devm_led_classdev_match(struct device *dev, void *res, void *data) { struct led_classdev **p = res; if (WARN_ON(!p || !*p)) return 0; return *p == data; } /** * devm_led_classdev_unregister() - resource managed led_classdev_unregister() * @dev: The device to unregister. * @led_cdev: the led_classdev structure for this device. */ void devm_led_classdev_unregister(struct device *dev, struct led_classdev *led_cdev) { WARN_ON(devres_release(dev, devm_led_classdev_release, devm_led_classdev_match, led_cdev)); } EXPORT_SYMBOL_GPL(devm_led_classdev_unregister); static int __init leds_init(void) { leds_class = class_create(THIS_MODULE, "leds"); if (IS_ERR(leds_class)) return PTR_ERR(leds_class); leds_class->pm = &leds_class_dev_pm_ops; leds_class->dev_groups = led_groups; return 0; } static void __exit leds_exit(void) { class_destroy(leds_class); } subsys_initcall(leds_init); module_exit(leds_exit); MODULE_AUTHOR("John Lenz, Richard Purdie"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("LED Class Interface"); |
1 15 3 2 1 1 15 1 1 13 1 50 21 2 16 2 1 21 8 35 10 26 23 10 14 6 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 | // SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/ematch.c Extended Match API * * Authors: Thomas Graf <tgraf@suug.ch> * * ========================================================================== * * An extended match (ematch) is a small classification tool not worth * writing a full classifier for. Ematches can be interconnected to form * a logic expression and get attached to classifiers to extend their * functionatlity. * * The userspace part transforms the logic expressions into an array * consisting of multiple sequences of interconnected ematches separated * by markers. Precedence is implemented by a special ematch kind * referencing a sequence beyond the marker of the current sequence * causing the current position in the sequence to be pushed onto a stack * to allow the current position to be overwritten by the position referenced * in the special ematch. Matching continues in the new sequence until a * marker is reached causing the position to be restored from the stack. * * Example: * A AND (B1 OR B2) AND C AND D * * ------->-PUSH------- * -->-- / -->-- \ -->-- * / \ / / \ \ / \ * +-------+-------+-------+-------+-------+--------+ * | A AND | B AND | C AND | D END | B1 OR | B2 END | * +-------+-------+-------+-------+-------+--------+ * \ / * --------<-POP--------- * * where B is a virtual ematch referencing to sequence starting with B1. * * ========================================================================== * * How to write an ematch in 60 seconds * ------------------------------------ * * 1) Provide a matcher function: * static int my_match(struct sk_buff *skb, struct tcf_ematch *m, * struct tcf_pkt_info *info) * { * struct mydata *d = (struct mydata *) m->data; * * if (...matching goes here...) * return 1; * else * return 0; * } * * 2) Fill out a struct tcf_ematch_ops: * static struct tcf_ematch_ops my_ops = { * .kind = unique id, * .datalen = sizeof(struct mydata), * .match = my_match, * .owner = THIS_MODULE, * }; * * 3) Register/Unregister your ematch: * static int __init init_my_ematch(void) * { * return tcf_em_register(&my_ops); * } * * static void __exit exit_my_ematch(void) * { * tcf_em_unregister(&my_ops); * } * * module_init(init_my_ematch); * module_exit(exit_my_ematch); * * 4) By now you should have two more seconds left, barely enough to * open up a beer to watch the compilation going. */ #include <linux/module.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/rtnetlink.h> #include <linux/skbuff.h> #include <net/pkt_cls.h> static LIST_HEAD(ematch_ops); static DEFINE_RWLOCK(ematch_mod_lock); static struct tcf_ematch_ops *tcf_em_lookup(u16 kind) { struct tcf_ematch_ops *e = NULL; read_lock(&ematch_mod_lock); list_for_each_entry(e, &ematch_ops, link) { if (kind == e->kind) { if (!try_module_get(e->owner)) e = NULL; read_unlock(&ematch_mod_lock); return e; } } read_unlock(&ematch_mod_lock); return NULL; } /** * tcf_em_register - register an extended match * * @ops: ematch operations lookup table * * This function must be called by ematches to announce their presence. * The given @ops must have kind set to a unique identifier and the * callback match() must be implemented. All other callbacks are optional * and a fallback implementation is used instead. * * Returns -EEXISTS if an ematch of the same kind has already registered. */ int tcf_em_register(struct tcf_ematch_ops *ops) { int err = -EEXIST; struct tcf_ematch_ops *e; if (ops->match == NULL) return -EINVAL; write_lock(&ematch_mod_lock); list_for_each_entry(e, &ematch_ops, link) if (ops->kind == e->kind) goto errout; list_add_tail(&ops->link, &ematch_ops); err = 0; errout: write_unlock(&ematch_mod_lock); return err; } EXPORT_SYMBOL(tcf_em_register); /** * tcf_em_unregister - unregister and extended match * * @ops: ematch operations lookup table * * This function must be called by ematches to announce their disappearance * for examples when the module gets unloaded. The @ops parameter must be * the same as the one used for registration. * * Returns -ENOENT if no matching ematch was found. */ void tcf_em_unregister(struct tcf_ematch_ops *ops) { write_lock(&ematch_mod_lock); list_del(&ops->link); write_unlock(&ematch_mod_lock); } EXPORT_SYMBOL(tcf_em_unregister); static inline struct tcf_ematch *tcf_em_get_match(struct tcf_ematch_tree *tree, int index) { return &tree->matches[index]; } static int tcf_em_validate(struct tcf_proto *tp, struct tcf_ematch_tree_hdr *tree_hdr, struct tcf_ematch *em, struct nlattr *nla, int idx) { int err = -EINVAL; struct tcf_ematch_hdr *em_hdr = nla_data(nla); int data_len = nla_len(nla) - sizeof(*em_hdr); void *data = (void *) em_hdr + sizeof(*em_hdr); struct net *net = tp->chain->block->net; if (!TCF_EM_REL_VALID(em_hdr->flags)) goto errout; if (em_hdr->kind == TCF_EM_CONTAINER) { /* Special ematch called "container", carries an index * referencing an external ematch sequence. */ u32 ref; if (data_len < sizeof(ref)) goto errout; ref = *(u32 *) data; if (ref >= tree_hdr->nmatches) goto errout; /* We do not allow backward jumps to avoid loops and jumps * to our own position are of course illegal. */ if (ref <= idx) goto errout; em->data = ref; } else { /* Note: This lookup will increase the module refcnt * of the ematch module referenced. In case of a failure, * a destroy function is called by the underlying layer * which automatically releases the reference again, therefore * the module MUST not be given back under any circumstances * here. Be aware, the destroy function assumes that the * module is held if the ops field is non zero. */ em->ops = tcf_em_lookup(em_hdr->kind); if (em->ops == NULL) { err = -ENOENT; #ifdef CONFIG_MODULES __rtnl_unlock(); request_module("ematch-kind-%u", em_hdr->kind); rtnl_lock(); em->ops = tcf_em_lookup(em_hdr->kind); if (em->ops) { /* We dropped the RTNL mutex in order to * perform the module load. Tell the caller * to replay the request. */ module_put(em->ops->owner); em->ops = NULL; err = -EAGAIN; } #endif goto errout; } /* ematch module provides expected length of data, so we * can do a basic sanity check. */ if (em->ops->datalen && data_len < em->ops->datalen) goto errout; if (em->ops->change) { err = -EINVAL; if (em_hdr->flags & TCF_EM_SIMPLE) goto errout; err = em->ops->change(net, data, data_len, em); if (err < 0) goto errout; } else if (data_len > 0) { /* ematch module doesn't provide an own change * procedure and expects us to allocate and copy * the ematch data. * * TCF_EM_SIMPLE may be specified stating that the * data only consists of a u32 integer and the module * does not expected a memory reference but rather * the value carried. */ if (em_hdr->flags & TCF_EM_SIMPLE) { if (em->ops->datalen > 0) goto errout; if (data_len < sizeof(u32)) goto errout; em->data = *(u32 *) data; } else { void *v = kmemdup(data, data_len, GFP_KERNEL); if (v == NULL) { err = -ENOBUFS; goto errout; } em->data = (unsigned long) v; } em->datalen = data_len; } } em->matchid = em_hdr->matchid; em->flags = em_hdr->flags; em->net = net; err = 0; errout: return err; } static const struct nla_policy em_policy[TCA_EMATCH_TREE_MAX + 1] = { [TCA_EMATCH_TREE_HDR] = { .len = sizeof(struct tcf_ematch_tree_hdr) }, [TCA_EMATCH_TREE_LIST] = { .type = NLA_NESTED }, }; /** * tcf_em_tree_validate - validate ematch config TLV and build ematch tree * * @tp: classifier kind handle * @nla: ematch tree configuration TLV * @tree: destination ematch tree variable to store the resulting * ematch tree. * * This function validates the given configuration TLV @nla and builds an * ematch tree in @tree. The resulting tree must later be copied into * the private classifier data using tcf_em_tree_change(). You MUST NOT * provide the ematch tree variable of the private classifier data directly, * the changes would not be locked properly. * * Returns a negative error code if the configuration TLV contains errors. */ int tcf_em_tree_validate(struct tcf_proto *tp, struct nlattr *nla, struct tcf_ematch_tree *tree) { int idx, list_len, matches_len, err; struct nlattr *tb[TCA_EMATCH_TREE_MAX + 1]; struct nlattr *rt_match, *rt_hdr, *rt_list; struct tcf_ematch_tree_hdr *tree_hdr; struct tcf_ematch *em; memset(tree, 0, sizeof(*tree)); if (!nla) return 0; err = nla_parse_nested_deprecated(tb, TCA_EMATCH_TREE_MAX, nla, em_policy, NULL); if (err < 0) goto errout; err = -EINVAL; rt_hdr = tb[TCA_EMATCH_TREE_HDR]; rt_list = tb[TCA_EMATCH_TREE_LIST]; if (rt_hdr == NULL || rt_list == NULL) goto errout; tree_hdr = nla_data(rt_hdr); memcpy(&tree->hdr, tree_hdr, sizeof(*tree_hdr)); rt_match = nla_data(rt_list); list_len = nla_len(rt_list); matches_len = tree_hdr->nmatches * sizeof(*em); tree->matches = kzalloc(matches_len, GFP_KERNEL); if (tree->matches == NULL) goto errout; /* We do not use nla_parse_nested here because the maximum * number of attributes is unknown. This saves us the allocation * for a tb buffer which would serve no purpose at all. * * The array of rt attributes is parsed in the order as they are * provided, their type must be incremental from 1 to n. Even * if it does not serve any real purpose, a failure of sticking * to this policy will result in parsing failure. */ for (idx = 0; nla_ok(rt_match, list_len); idx++) { err = -EINVAL; if (rt_match->nla_type != (idx + 1)) goto errout_abort; if (idx >= tree_hdr->nmatches) goto errout_abort; if (nla_len(rt_match) < sizeof(struct tcf_ematch_hdr)) goto errout_abort; em = tcf_em_get_match(tree, idx); err = tcf_em_validate(tp, tree_hdr, em, rt_match, idx); if (err < 0) goto errout_abort; rt_match = nla_next(rt_match, &list_len); } /* Check if the number of matches provided by userspace actually * complies with the array of matches. The number was used for * the validation of references and a mismatch could lead to * undefined references during the matching process. */ if (idx != tree_hdr->nmatches) { err = -EINVAL; goto errout_abort; } err = 0; errout: return err; errout_abort: tcf_em_tree_destroy(tree); return err; } EXPORT_SYMBOL(tcf_em_tree_validate); /** * tcf_em_tree_destroy - destroy an ematch tree * * @tree: ematch tree to be deleted * * This functions destroys an ematch tree previously created by * tcf_em_tree_validate()/tcf_em_tree_change(). You must ensure that * the ematch tree is not in use before calling this function. */ void tcf_em_tree_destroy(struct tcf_ematch_tree *tree) { int i; if (tree->matches == NULL) return; for (i = 0; i < tree->hdr.nmatches; i++) { struct tcf_ematch *em = tcf_em_get_match(tree, i); if (em->ops) { if (em->ops->destroy) em->ops->destroy(em); else if (!tcf_em_is_simple(em)) kfree((void *) em->data); module_put(em->ops->owner); } } tree->hdr.nmatches = 0; kfree(tree->matches); tree->matches = NULL; } EXPORT_SYMBOL(tcf_em_tree_destroy); /** * tcf_em_tree_dump - dump ematch tree into a rtnl message * * @skb: skb holding the rtnl message * @tree: ematch tree to be dumped * @tlv: TLV type to be used to encapsulate the tree * * This function dumps a ematch tree into a rtnl message. It is valid to * call this function while the ematch tree is in use. * * Returns -1 if the skb tailroom is insufficient. */ int tcf_em_tree_dump(struct sk_buff *skb, struct tcf_ematch_tree *tree, int tlv) { int i; u8 *tail; struct nlattr *top_start; struct nlattr *list_start; top_start = nla_nest_start_noflag(skb, tlv); if (top_start == NULL) goto nla_put_failure; if (nla_put(skb, TCA_EMATCH_TREE_HDR, sizeof(tree->hdr), &tree->hdr)) goto nla_put_failure; list_start = nla_nest_start_noflag(skb, TCA_EMATCH_TREE_LIST); if (list_start == NULL) goto nla_put_failure; tail = skb_tail_pointer(skb); for (i = 0; i < tree->hdr.nmatches; i++) { struct nlattr *match_start = (struct nlattr *)tail; struct tcf_ematch *em = tcf_em_get_match(tree, i); struct tcf_ematch_hdr em_hdr = { .kind = em->ops ? em->ops->kind : TCF_EM_CONTAINER, .matchid = em->matchid, .flags = em->flags }; if (nla_put(skb, i + 1, sizeof(em_hdr), &em_hdr)) goto nla_put_failure; if (em->ops && em->ops->dump) { if (em->ops->dump(skb, em) < 0) goto nla_put_failure; } else if (tcf_em_is_container(em) || tcf_em_is_simple(em)) { u32 u = em->data; nla_put_nohdr(skb, sizeof(u), &u); } else if (em->datalen > 0) nla_put_nohdr(skb, em->datalen, (void *) em->data); tail = skb_tail_pointer(skb); match_start->nla_len = tail - (u8 *)match_start; } nla_nest_end(skb, list_start); nla_nest_end(skb, top_start); return 0; nla_put_failure: return -1; } EXPORT_SYMBOL(tcf_em_tree_dump); static inline int tcf_em_match(struct sk_buff *skb, struct tcf_ematch *em, struct tcf_pkt_info *info) { int r = em->ops->match(skb, em, info); return tcf_em_is_inverted(em) ? !r : r; } /* Do not use this function directly, use tcf_em_tree_match instead */ int __tcf_em_tree_match(struct sk_buff *skb, struct tcf_ematch_tree *tree, struct tcf_pkt_info *info) { int stackp = 0, match_idx = 0, res = 0; struct tcf_ematch *cur_match; int stack[CONFIG_NET_EMATCH_STACK]; proceed: while (match_idx < tree->hdr.nmatches) { cur_match = tcf_em_get_match(tree, match_idx); if (tcf_em_is_container(cur_match)) { if (unlikely(stackp >= CONFIG_NET_EMATCH_STACK)) goto stack_overflow; stack[stackp++] = match_idx; match_idx = cur_match->data; goto proceed; } res = tcf_em_match(skb, cur_match, info); if (tcf_em_early_end(cur_match, res)) break; match_idx++; } pop_stack: if (stackp > 0) { match_idx = stack[--stackp]; cur_match = tcf_em_get_match(tree, match_idx); if (tcf_em_is_inverted(cur_match)) res = !res; if (tcf_em_early_end(cur_match, res)) { goto pop_stack; } else { match_idx++; goto proceed; } } return res; stack_overflow: net_warn_ratelimited("tc ematch: local stack overflow, increase NET_EMATCH_STACK\n"); return -1; } EXPORT_SYMBOL(__tcf_em_tree_match); |
49 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 | /* SPDX-License-Identifier: GPL-2.0-only */ /* CPU virtualization extensions handling * * This should carry the code for handling CPU virtualization extensions * that needs to live in the kernel core. * * Author: Eduardo Habkost <ehabkost@redhat.com> * * Copyright (C) 2008, Red Hat Inc. * * Contains code from KVM, Copyright (C) 2006 Qumranet, Inc. */ #ifndef _ASM_X86_VIRTEX_H #define _ASM_X86_VIRTEX_H #include <asm/processor.h> #include <asm/vmx.h> #include <asm/svm.h> #include <asm/tlbflush.h> /* * VMX functions: */ static inline int cpu_has_vmx(void) { unsigned long ecx = cpuid_ecx(1); return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */ } /** * cpu_vmxoff() - Disable VMX on the current CPU * * Disable VMX and clear CR4.VMXE (even if VMXOFF faults) * * Note, VMXOFF causes a #UD if the CPU is !post-VMXON, but it's impossible to * atomically track post-VMXON state, e.g. this may be called in NMI context. * Eat all faults as all other faults on VMXOFF faults are mode related, i.e. * faults are guaranteed to be due to the !post-VMXON check unless the CPU is * magically in RM, VM86, compat mode, or at CPL>0. */ static inline int cpu_vmxoff(void) { asm goto("1: vmxoff\n\t" _ASM_EXTABLE(1b, %l[fault]) ::: "cc", "memory" : fault); cr4_clear_bits(X86_CR4_VMXE); return 0; fault: cr4_clear_bits(X86_CR4_VMXE); return -EIO; } static inline int cpu_vmx_enabled(void) { return __read_cr4() & X86_CR4_VMXE; } /** Disable VMX if it is enabled on the current CPU * * You shouldn't call this if cpu_has_vmx() returns 0. */ static inline void __cpu_emergency_vmxoff(void) { if (cpu_vmx_enabled()) cpu_vmxoff(); } /** Disable VMX if it is supported and enabled on the current CPU */ static inline void cpu_emergency_vmxoff(void) { if (cpu_has_vmx()) __cpu_emergency_vmxoff(); } /* * SVM functions: */ /** Check if the CPU has SVM support * * You can use the 'msg' arg to get a message describing the problem, * if the function returns zero. Simply pass NULL if you are not interested * on the messages; gcc should take care of not generating code for * the messages on this case. */ static inline int cpu_has_svm(const char **msg) { if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) { if (msg) *msg = "not amd or hygon"; return 0; } if (!boot_cpu_has(X86_FEATURE_SVM)) { if (msg) *msg = "svm not available"; return 0; } return 1; } /** Disable SVM on the current CPU * * You should call this only if cpu_has_svm() returned true. */ static inline void cpu_svm_disable(void) { uint64_t efer; wrmsrl(MSR_VM_HSAVE_PA, 0); rdmsrl(MSR_EFER, efer); if (efer & EFER_SVME) { /* * Force GIF=1 prior to disabling SVM to ensure INIT and NMI * aren't blocked, e.g. if a fatal error occurred between CLGI * and STGI. Note, STGI may #UD if SVM is disabled from NMI * context between reading EFER and executing STGI. In that * case, GIF must already be set, otherwise the NMI would have * been blocked, so just eat the fault. */ asm goto("1: stgi\n\t" _ASM_EXTABLE(1b, %l[fault]) ::: "memory" : fault); fault: wrmsrl(MSR_EFER, efer & ~EFER_SVME); } } /** Makes sure SVM is disabled, if it is supported on the CPU */ static inline void cpu_emergency_svm_disable(void) { if (cpu_has_svm(NULL)) cpu_svm_disable(); } #endif /* _ASM_X86_VIRTEX_H */ |
7 6 7 1 3 5 17 7 15 37 18 19 37 11 4 4 4 5 4 4 4 1 6 6 6 5 19 19 17 2 15 18 18 2 5 4 7 2 18 3 3 3 5 11 4 7 4 9 5 3 9 8 6 9 15 1 15 15 1 14 14 2 14 14 11 3 12 16 16 16 5 1 3 1 17 17 2 1 1 95 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 | // SPDX-License-Identifier: GPL-2.0 /* * Provide access to virtual console memory. * /dev/vcs: the screen as it is being viewed right now (possibly scrolled) * /dev/vcsN: the screen of /dev/ttyN (1 <= N <= 63) * [minor: N] * * /dev/vcsaN: idem, but including attributes, and prefixed with * the 4 bytes lines,columns,x,y (as screendump used to give). * Attribute/character pair is in native endianity. * [minor: N+128] * * /dev/vcsuN: similar to /dev/vcsaN but using 4-byte unicode values * instead of 1-byte screen glyph values. * [minor: N+64] * * /dev/vcsuaN: same idea as /dev/vcsaN for unicode (not yet implemented). * * This replaces screendump and part of selection, so that the system * administrator can control access using file system permissions. * * aeb@cwi.nl - efter Friedas begravelse - 950211 * * machek@k332.feld.cvut.cz - modified not to send characters to wrong console * - fixed some fatal off-by-one bugs (0-- no longer == -1 -> looping and looping and looping...) * - making it shorter - scr_readw are macros which expand in PRETTY long code */ #include <linux/kernel.h> #include <linux/major.h> #include <linux/errno.h> #include <linux/export.h> #include <linux/tty.h> #include <linux/interrupt.h> #include <linux/mm.h> #include <linux/init.h> #include <linux/vt_kern.h> #include <linux/selection.h> #include <linux/kbd_kern.h> #include <linux/console.h> #include <linux/device.h> #include <linux/sched.h> #include <linux/fs.h> #include <linux/poll.h> #include <linux/signal.h> #include <linux/slab.h> #include <linux/notifier.h> #include <linux/uaccess.h> #include <asm/byteorder.h> #include <asm/unaligned.h> #define HEADER_SIZE 4u #define CON_BUF_SIZE (CONFIG_BASE_SMALL ? 256 : PAGE_SIZE) /* * Our minor space: * * 0 ... 63 glyph mode without attributes * 64 ... 127 unicode mode without attributes * 128 ... 191 glyph mode with attributes * 192 ... 255 unused (reserved for unicode with attributes) * * This relies on MAX_NR_CONSOLES being <= 63, meaning 63 actual consoles * with minors 0, 64, 128 and 192 being proxies for the foreground console. */ #if MAX_NR_CONSOLES > 63 #warning "/dev/vcs* devices may not accommodate more than 63 consoles" #endif #define console(inode) (iminor(inode) & 63) #define use_unicode(inode) (iminor(inode) & 64) #define use_attributes(inode) (iminor(inode) & 128) struct vcs_poll_data { struct notifier_block notifier; unsigned int cons_num; int event; wait_queue_head_t waitq; struct fasync_struct *fasync; }; static int vcs_notifier(struct notifier_block *nb, unsigned long code, void *_param) { struct vt_notifier_param *param = _param; struct vc_data *vc = param->vc; struct vcs_poll_data *poll = container_of(nb, struct vcs_poll_data, notifier); int currcons = poll->cons_num; int fa_band; switch (code) { case VT_UPDATE: fa_band = POLL_PRI; break; case VT_DEALLOCATE: fa_band = POLL_HUP; break; default: return NOTIFY_DONE; } if (currcons == 0) currcons = fg_console; else currcons--; if (currcons != vc->vc_num) return NOTIFY_DONE; poll->event = code; wake_up_interruptible(&poll->waitq); kill_fasync(&poll->fasync, SIGIO, fa_band); return NOTIFY_OK; } static void vcs_poll_data_free(struct vcs_poll_data *poll) { unregister_vt_notifier(&poll->notifier); kfree(poll); } static struct vcs_poll_data * vcs_poll_data_get(struct file *file) { struct vcs_poll_data *poll = file->private_data, *kill = NULL; if (poll) return poll; poll = kzalloc(sizeof(*poll), GFP_KERNEL); if (!poll) return NULL; poll->cons_num = console(file_inode(file)); init_waitqueue_head(&poll->waitq); poll->notifier.notifier_call = vcs_notifier; /* * In order not to lose any update event, we must pretend one might * have occurred before we have a chance to register our notifier. * This is also how user space has come to detect which kernels * support POLLPRI on /dev/vcs* devices i.e. using poll() with * POLLPRI and a zero timeout. */ poll->event = VT_UPDATE; if (register_vt_notifier(&poll->notifier) != 0) { kfree(poll); return NULL; } /* * This code may be called either through ->poll() or ->fasync(). * If we have two threads using the same file descriptor, they could * both enter this function, both notice that the structure hasn't * been allocated yet and go ahead allocating it in parallel, but * only one of them must survive and be shared otherwise we'd leak * memory with a dangling notifier callback. */ spin_lock(&file->f_lock); if (!file->private_data) { file->private_data = poll; } else { /* someone else raced ahead of us */ kill = poll; poll = file->private_data; } spin_unlock(&file->f_lock); if (kill) vcs_poll_data_free(kill); return poll; } /** * vcs_vc -- return VC for @inode * @inode: inode for which to return a VC * @viewed: returns whether this console is currently foreground (viewed) * * Must be called with console_lock. */ static struct vc_data *vcs_vc(struct inode *inode, bool *viewed) { unsigned int currcons = console(inode); WARN_CONSOLE_UNLOCKED(); if (currcons == 0) { currcons = fg_console; if (viewed) *viewed = true; } else { currcons--; if (viewed) *viewed = false; } return vc_cons[currcons].d; } /** * vcs_size -- return size for a VC in @vc * @vc: which VC * @attr: does it use attributes? * @unicode: is it unicode? * * Must be called with console_lock. */ static int vcs_size(const struct vc_data *vc, bool attr, bool unicode) { int size; WARN_CONSOLE_UNLOCKED(); size = vc->vc_rows * vc->vc_cols; if (attr) { if (unicode) return -EOPNOTSUPP; size = 2 * size + HEADER_SIZE; } else if (unicode) size *= 4; return size; } static loff_t vcs_lseek(struct file *file, loff_t offset, int orig) { struct inode *inode = file_inode(file); struct vc_data *vc; int size; console_lock(); vc = vcs_vc(inode, NULL); if (!vc) { console_unlock(); return -ENXIO; } size = vcs_size(vc, use_attributes(inode), use_unicode(inode)); console_unlock(); if (size < 0) return size; return fixed_size_llseek(file, offset, orig, size); } static int vcs_read_buf_uni(struct vc_data *vc, char *con_buf, unsigned int pos, unsigned int count, bool viewed) { unsigned int nr, row, col, maxcol = vc->vc_cols; int ret; ret = vc_uniscr_check(vc); if (ret) return ret; pos /= 4; row = pos / maxcol; col = pos % maxcol; nr = maxcol - col; do { if (nr > count / 4) nr = count / 4; vc_uniscr_copy_line(vc, con_buf, viewed, row, col, nr); con_buf += nr * 4; count -= nr * 4; row++; col = 0; nr = maxcol; } while (count); return 0; } static void vcs_read_buf_noattr(const struct vc_data *vc, char *con_buf, unsigned int pos, unsigned int count, bool viewed) { u16 *org; unsigned int col, maxcol = vc->vc_cols; org = screen_pos(vc, pos, viewed); col = pos % maxcol; pos += maxcol - col; while (count-- > 0) { *con_buf++ = (vcs_scr_readw(vc, org++) & 0xff); if (++col == maxcol) { org = screen_pos(vc, pos, viewed); col = 0; pos += maxcol; } } } static unsigned int vcs_read_buf(const struct vc_data *vc, char *con_buf, unsigned int pos, unsigned int count, bool viewed, unsigned int *skip) { u16 *org, *con_buf16; unsigned int col, maxcol = vc->vc_cols; unsigned int filled = count; if (pos < HEADER_SIZE) { /* clamp header values if they don't fit */ con_buf[0] = min(vc->vc_rows, 0xFFu); con_buf[1] = min(vc->vc_cols, 0xFFu); getconsxy(vc, con_buf + 2); *skip += pos; count += pos; if (count > CON_BUF_SIZE) { count = CON_BUF_SIZE; filled = count - pos; } /* Advance state pointers and move on. */ count -= min(HEADER_SIZE, count); pos = HEADER_SIZE; con_buf += HEADER_SIZE; /* If count >= 0, then pos is even... */ } else if (pos & 1) { /* * Skip first byte for output if start address is odd. Update * region sizes up/down depending on free space in buffer. */ (*skip)++; if (count < CON_BUF_SIZE) count++; else filled--; } if (!count) return filled; pos -= HEADER_SIZE; pos /= 2; col = pos % maxcol; org = screen_pos(vc, pos, viewed); pos += maxcol - col; /* * Buffer has even length, so we can always copy character + attribute. * We do not copy last byte to userspace if count is odd. */ count = (count + 1) / 2; con_buf16 = (u16 *)con_buf; while (count) { *con_buf16++ = vcs_scr_readw(vc, org++); count--; if (++col == maxcol) { org = screen_pos(vc, pos, viewed); col = 0; pos += maxcol; } } return filled; } static ssize_t vcs_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct inode *inode = file_inode(file); struct vc_data *vc; struct vcs_poll_data *poll; unsigned int read; ssize_t ret; char *con_buf; loff_t pos; bool viewed, attr, uni_mode; con_buf = (char *) __get_free_page(GFP_KERNEL); if (!con_buf) return -ENOMEM; pos = *ppos; /* Select the proper current console and verify * sanity of the situation under the console lock. */ console_lock(); uni_mode = use_unicode(inode); attr = use_attributes(inode); ret = -EINVAL; if (pos < 0) goto unlock_out; /* we enforce 32-bit alignment for pos and count in unicode mode */ if (uni_mode && (pos | count) & 3) goto unlock_out; poll = file->private_data; if (count && poll) poll->event = 0; read = 0; ret = 0; while (count) { unsigned int this_round, skip = 0; int size; vc = vcs_vc(inode, &viewed); if (!vc) { ret = -ENXIO; break; } /* Check whether we are above size each round, * as copy_to_user at the end of this loop * could sleep. */ size = vcs_size(vc, attr, uni_mode); if (size < 0) { ret = size; break; } if (pos >= size) break; if (count > size - pos) count = size - pos; this_round = count; if (this_round > CON_BUF_SIZE) this_round = CON_BUF_SIZE; /* Perform the whole read into the local con_buf. * Then we can drop the console spinlock and safely * attempt to move it to userspace. */ if (uni_mode) { ret = vcs_read_buf_uni(vc, con_buf, pos, this_round, viewed); if (ret) break; } else if (!attr) { vcs_read_buf_noattr(vc, con_buf, pos, this_round, viewed); } else { this_round = vcs_read_buf(vc, con_buf, pos, this_round, viewed, &skip); } /* Finally, release the console semaphore while we push * all the data to userspace from our temporary buffer. * * AKPM: Even though it's a semaphore, we should drop it because * the pagefault handling code may want to call printk(). */ console_unlock(); ret = copy_to_user(buf, con_buf + skip, this_round); console_lock(); if (ret) { read += this_round - ret; ret = -EFAULT; break; } buf += this_round; pos += this_round; read += this_round; count -= this_round; } *ppos += read; if (read) ret = read; unlock_out: console_unlock(); free_page((unsigned long) con_buf); return ret; } static u16 *vcs_write_buf_noattr(struct vc_data *vc, const char *con_buf, unsigned int pos, unsigned int count, bool viewed, u16 **org0) { u16 *org; unsigned int col, maxcol = vc->vc_cols; *org0 = org = screen_pos(vc, pos, viewed); col = pos % maxcol; pos += maxcol - col; while (count > 0) { unsigned char c = *con_buf++; count--; vcs_scr_writew(vc, (vcs_scr_readw(vc, org) & 0xff00) | c, org); org++; if (++col == maxcol) { org = screen_pos(vc, pos, viewed); col = 0; pos += maxcol; } } return org; } /* * Compilers (gcc 10) are unable to optimize the swap in cpu_to_le16. So do it * the poor man way. */ static inline u16 vc_compile_le16(u8 hi, u8 lo) { #ifdef __BIG_ENDIAN return (lo << 8u) | hi; #else return (hi << 8u) | lo; #endif } static u16 *vcs_write_buf(struct vc_data *vc, const char *con_buf, unsigned int pos, unsigned int count, bool viewed, u16 **org0) { u16 *org; unsigned int col, maxcol = vc->vc_cols; unsigned char c; /* header */ if (pos < HEADER_SIZE) { char header[HEADER_SIZE]; getconsxy(vc, header + 2); while (pos < HEADER_SIZE && count > 0) { count--; header[pos++] = *con_buf++; } if (!viewed) putconsxy(vc, header + 2); } if (!count) return NULL; pos -= HEADER_SIZE; col = (pos/2) % maxcol; *org0 = org = screen_pos(vc, pos/2, viewed); /* odd pos -- the first single character */ if (pos & 1) { count--; c = *con_buf++; vcs_scr_writew(vc, vc_compile_le16(c, vcs_scr_readw(vc, org)), org); org++; pos++; if (++col == maxcol) { org = screen_pos(vc, pos/2, viewed); col = 0; } } pos /= 2; pos += maxcol - col; /* even pos -- handle attr+character pairs */ while (count > 1) { unsigned short w; w = get_unaligned(((unsigned short *)con_buf)); vcs_scr_writew(vc, w, org++); con_buf += 2; count -= 2; if (++col == maxcol) { org = screen_pos(vc, pos, viewed); col = 0; pos += maxcol; } } if (!count) return org; /* odd pos -- the remaining character */ c = *con_buf++; vcs_scr_writew(vc, vc_compile_le16(vcs_scr_readw(vc, org) >> 8, c), org); return org; } static ssize_t vcs_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct inode *inode = file_inode(file); struct vc_data *vc; char *con_buf; u16 *org0, *org; unsigned int written; int size; ssize_t ret; loff_t pos; bool viewed, attr; if (use_unicode(inode)) return -EOPNOTSUPP; con_buf = (char *) __get_free_page(GFP_KERNEL); if (!con_buf) return -ENOMEM; pos = *ppos; /* Select the proper current console and verify * sanity of the situation under the console lock. */ console_lock(); attr = use_attributes(inode); ret = -ENXIO; vc = vcs_vc(inode, &viewed); if (!vc) goto unlock_out; size = vcs_size(vc, attr, false); if (size < 0) { ret = size; goto unlock_out; } ret = -EINVAL; if (pos < 0 || pos > size) goto unlock_out; if (count > size - pos) count = size - pos; written = 0; while (count) { unsigned int this_round = count; if (this_round > CON_BUF_SIZE) this_round = CON_BUF_SIZE; /* Temporarily drop the console lock so that we can read * in the write data from userspace safely. */ console_unlock(); ret = copy_from_user(con_buf, buf, this_round); console_lock(); if (ret) { this_round -= ret; if (!this_round) { /* Abort loop if no data were copied. Otherwise * fail with -EFAULT. */ if (written) break; ret = -EFAULT; goto unlock_out; } } /* The vc might have been freed or vcs_size might have changed * while we slept to grab the user buffer, so recheck. * Return data written up to now on failure. */ vc = vcs_vc(inode, &viewed); if (!vc) { if (written) break; ret = -ENXIO; goto unlock_out; } size = vcs_size(vc, attr, false); if (size < 0) { if (written) break; ret = size; goto unlock_out; } if (pos >= size) break; if (this_round > size - pos) this_round = size - pos; /* OK, now actually push the write to the console * under the lock using the local kernel buffer. */ if (attr) org = vcs_write_buf(vc, con_buf, pos, this_round, viewed, &org0); else org = vcs_write_buf_noattr(vc, con_buf, pos, this_round, viewed, &org0); count -= this_round; written += this_round; buf += this_round; pos += this_round; if (org) update_region(vc, (unsigned long)(org0), org - org0); } *ppos += written; ret = written; if (written) vcs_scr_updated(vc); unlock_out: console_unlock(); free_page((unsigned long) con_buf); return ret; } static __poll_t vcs_poll(struct file *file, poll_table *wait) { struct vcs_poll_data *poll = vcs_poll_data_get(file); __poll_t ret = DEFAULT_POLLMASK|EPOLLERR; if (poll) { poll_wait(file, &poll->waitq, wait); switch (poll->event) { case VT_UPDATE: ret = DEFAULT_POLLMASK|EPOLLPRI; break; case VT_DEALLOCATE: ret = DEFAULT_POLLMASK|EPOLLHUP|EPOLLERR; break; case 0: ret = DEFAULT_POLLMASK; break; } } return ret; } static int vcs_fasync(int fd, struct file *file, int on) { struct vcs_poll_data *poll = file->private_data; if (!poll) { /* don't allocate anything if all we want is disable fasync */ if (!on) return 0; poll = vcs_poll_data_get(file); if (!poll) return -ENOMEM; } return fasync_helper(fd, file, on, &poll->fasync); } static int vcs_open(struct inode *inode, struct file *filp) { unsigned int currcons = console(inode); bool attr = use_attributes(inode); bool uni_mode = use_unicode(inode); int ret = 0; /* we currently don't support attributes in unicode mode */ if (attr && uni_mode) return -EOPNOTSUPP; console_lock(); if(currcons && !vc_cons_allocated(currcons-1)) ret = -ENXIO; console_unlock(); return ret; } static int vcs_release(struct inode *inode, struct file *file) { struct vcs_poll_data *poll = file->private_data; if (poll) vcs_poll_data_free(poll); return 0; } static const struct file_operations vcs_fops = { .llseek = vcs_lseek, .read = vcs_read, .write = vcs_write, .poll = vcs_poll, .fasync = vcs_fasync, .open = vcs_open, .release = vcs_release, }; static struct class *vc_class; void vcs_make_sysfs(int index) { device_create(vc_class, NULL, MKDEV(VCS_MAJOR, index + 1), NULL, "vcs%u", index + 1); device_create(vc_class, NULL, MKDEV(VCS_MAJOR, index + 65), NULL, "vcsu%u", index + 1); device_create(vc_class, NULL, MKDEV(VCS_MAJOR, index + 129), NULL, "vcsa%u", index + 1); } void vcs_remove_sysfs(int index) { device_destroy(vc_class, MKDEV(VCS_MAJOR, index + 1)); device_destroy(vc_class, MKDEV(VCS_MAJOR, index + 65)); device_destroy(vc_class, MKDEV(VCS_MAJOR, index + 129)); } int __init vcs_init(void) { unsigned int i; if (register_chrdev(VCS_MAJOR, "vcs", &vcs_fops)) panic("unable to get major %d for vcs device", VCS_MAJOR); vc_class = class_create(THIS_MODULE, "vc"); device_create(vc_class, NULL, MKDEV(VCS_MAJOR, 0), NULL, "vcs"); device_create(vc_class, NULL, MKDEV(VCS_MAJOR, 64), NULL, "vcsu"); device_create(vc_class, NULL, MKDEV(VCS_MAJOR, 128), NULL, "vcsa"); for (i = 0; i < MIN_NR_CONSOLES; i++) vcs_make_sysfs(i); return 0; } |
17 6 15 14 1 10 12 9 13 12 1 10 10 9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 | // SPDX-License-Identifier: GPL-2.0-only /* * This is a module which is used for setting the MSS option in TCP packets. * * Copyright (C) 2000 Marc Boucher <marc@mbsi.ca> * Copyright (C) 2007 Patrick McHardy <kaber@trash.net> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/ip.h> #include <linux/gfp.h> #include <linux/ipv6.h> #include <linux/tcp.h> #include <net/dst.h> #include <net/flow.h> #include <net/ipv6.h> #include <net/route.h> #include <net/tcp.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv6/ip6_tables.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_tcpudp.h> #include <linux/netfilter/xt_TCPMSS.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>"); MODULE_DESCRIPTION("Xtables: TCP Maximum Segment Size (MSS) adjustment"); MODULE_ALIAS("ipt_TCPMSS"); MODULE_ALIAS("ip6t_TCPMSS"); static inline unsigned int optlen(const u_int8_t *opt, unsigned int offset) { /* Beware zero-length options: make finite progress */ if (opt[offset] <= TCPOPT_NOP || opt[offset+1] == 0) return 1; else return opt[offset+1]; } static u_int32_t tcpmss_reverse_mtu(struct net *net, const struct sk_buff *skb, unsigned int family) { struct flowi fl; struct rtable *rt = NULL; u_int32_t mtu = ~0U; if (family == PF_INET) { struct flowi4 *fl4 = &fl.u.ip4; memset(fl4, 0, sizeof(*fl4)); fl4->daddr = ip_hdr(skb)->saddr; } else { struct flowi6 *fl6 = &fl.u.ip6; memset(fl6, 0, sizeof(*fl6)); fl6->daddr = ipv6_hdr(skb)->saddr; } nf_route(net, (struct dst_entry **)&rt, &fl, false, family); if (rt != NULL) { mtu = dst_mtu(&rt->dst); dst_release(&rt->dst); } return mtu; } static int tcpmss_mangle_packet(struct sk_buff *skb, const struct xt_action_param *par, unsigned int family, unsigned int tcphoff, unsigned int minlen) { const struct xt_tcpmss_info *info = par->targinfo; struct tcphdr *tcph; int len, tcp_hdrlen; unsigned int i; __be16 oldval; u16 newmss; u8 *opt; /* This is a fragment, no TCP header is available */ if (par->fragoff != 0) return 0; if (skb_ensure_writable(skb, skb->len)) return -1; len = skb->len - tcphoff; if (len < (int)sizeof(struct tcphdr)) return -1; tcph = (struct tcphdr *)(skb_network_header(skb) + tcphoff); tcp_hdrlen = tcph->doff * 4; if (len < tcp_hdrlen || tcp_hdrlen < sizeof(struct tcphdr)) return -1; if (info->mss == XT_TCPMSS_CLAMP_PMTU) { struct net *net = xt_net(par); unsigned int in_mtu = tcpmss_reverse_mtu(net, skb, family); unsigned int min_mtu = min(dst_mtu(skb_dst(skb)), in_mtu); if (min_mtu <= minlen) { net_err_ratelimited("unknown or invalid path-MTU (%u)\n", min_mtu); return -1; } newmss = min_mtu - minlen; } else newmss = info->mss; opt = (u_int8_t *)tcph; for (i = sizeof(struct tcphdr); i <= tcp_hdrlen - TCPOLEN_MSS; i += optlen(opt, i)) { if (opt[i] == TCPOPT_MSS && opt[i+1] == TCPOLEN_MSS) { u_int16_t oldmss; oldmss = (opt[i+2] << 8) | opt[i+3]; /* Never increase MSS, even when setting it, as * doing so results in problems for hosts that rely * on MSS being set correctly. */ if (oldmss <= newmss) return 0; opt[i+2] = (newmss & 0xff00) >> 8; opt[i+3] = newmss & 0x00ff; inet_proto_csum_replace2(&tcph->check, skb, htons(oldmss), htons(newmss), false); return 0; } } /* There is data after the header so the option can't be added * without moving it, and doing so may make the SYN packet * itself too large. Accept the packet unmodified instead. */ if (len > tcp_hdrlen) return 0; /* tcph->doff has 4 bits, do not wrap it to 0 */ if (tcp_hdrlen >= 15 * 4) return 0; /* * MSS Option not found ?! add it.. */ if (skb_tailroom(skb) < TCPOLEN_MSS) { if (pskb_expand_head(skb, 0, TCPOLEN_MSS - skb_tailroom(skb), GFP_ATOMIC)) return -1; tcph = (struct tcphdr *)(skb_network_header(skb) + tcphoff); } skb_put(skb, TCPOLEN_MSS); /* * IPv4: RFC 1122 states "If an MSS option is not received at * connection setup, TCP MUST assume a default send MSS of 536". * IPv6: RFC 2460 states IPv6 has a minimum MTU of 1280 and a minimum * length IPv6 header of 60, ergo the default MSS value is 1220 * Since no MSS was provided, we must use the default values */ if (xt_family(par) == NFPROTO_IPV4) newmss = min(newmss, (u16)536); else newmss = min(newmss, (u16)1220); opt = (u_int8_t *)tcph + sizeof(struct tcphdr); memmove(opt + TCPOLEN_MSS, opt, len - sizeof(struct tcphdr)); inet_proto_csum_replace2(&tcph->check, skb, htons(len), htons(len + TCPOLEN_MSS), true); opt[0] = TCPOPT_MSS; opt[1] = TCPOLEN_MSS; opt[2] = (newmss & 0xff00) >> 8; opt[3] = newmss & 0x00ff; inet_proto_csum_replace4(&tcph->check, skb, 0, *((__be32 *)opt), false); oldval = ((__be16 *)tcph)[6]; tcph->doff += TCPOLEN_MSS/4; inet_proto_csum_replace2(&tcph->check, skb, oldval, ((__be16 *)tcph)[6], false); return TCPOLEN_MSS; } static unsigned int tcpmss_tg4(struct sk_buff *skb, const struct xt_action_param *par) { struct iphdr *iph = ip_hdr(skb); __be16 newlen; int ret; ret = tcpmss_mangle_packet(skb, par, PF_INET, iph->ihl * 4, sizeof(*iph) + sizeof(struct tcphdr)); if (ret < 0) return NF_DROP; if (ret > 0) { iph = ip_hdr(skb); newlen = htons(ntohs(iph->tot_len) + ret); csum_replace2(&iph->check, iph->tot_len, newlen); iph->tot_len = newlen; } return XT_CONTINUE; } #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) static unsigned int tcpmss_tg6(struct sk_buff *skb, const struct xt_action_param *par) { struct ipv6hdr *ipv6h = ipv6_hdr(skb); u8 nexthdr; __be16 frag_off, oldlen, newlen; int tcphoff; int ret; nexthdr = ipv6h->nexthdr; tcphoff = ipv6_skip_exthdr(skb, sizeof(*ipv6h), &nexthdr, &frag_off); if (tcphoff < 0) return NF_DROP; ret = tcpmss_mangle_packet(skb, par, PF_INET6, tcphoff, sizeof(*ipv6h) + sizeof(struct tcphdr)); if (ret < 0) return NF_DROP; if (ret > 0) { ipv6h = ipv6_hdr(skb); oldlen = ipv6h->payload_len; newlen = htons(ntohs(oldlen) + ret); if (skb->ip_summed == CHECKSUM_COMPLETE) skb->csum = csum_add(csum_sub(skb->csum, (__force __wsum)oldlen), (__force __wsum)newlen); ipv6h->payload_len = newlen; } return XT_CONTINUE; } #endif /* Must specify -p tcp --syn */ static inline bool find_syn_match(const struct xt_entry_match *m) { const struct xt_tcp *tcpinfo = (const struct xt_tcp *)m->data; if (strcmp(m->u.kernel.match->name, "tcp") == 0 && tcpinfo->flg_cmp & TCPHDR_SYN && !(tcpinfo->invflags & XT_TCP_INV_FLAGS)) return true; return false; } static int tcpmss_tg4_check(const struct xt_tgchk_param *par) { const struct xt_tcpmss_info *info = par->targinfo; const struct ipt_entry *e = par->entryinfo; const struct xt_entry_match *ematch; if (info->mss == XT_TCPMSS_CLAMP_PMTU && (par->hook_mask & ~((1 << NF_INET_FORWARD) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_POST_ROUTING))) != 0) { pr_info_ratelimited("path-MTU clamping only supported in FORWARD, OUTPUT and POSTROUTING hooks\n"); return -EINVAL; } if (par->nft_compat) return 0; xt_ematch_foreach(ematch, e) if (find_syn_match(ematch)) return 0; pr_info_ratelimited("Only works on TCP SYN packets\n"); return -EINVAL; } #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) static int tcpmss_tg6_check(const struct xt_tgchk_param *par) { const struct xt_tcpmss_info *info = par->targinfo; const struct ip6t_entry *e = par->entryinfo; const struct xt_entry_match *ematch; if (info->mss == XT_TCPMSS_CLAMP_PMTU && (par->hook_mask & ~((1 << NF_INET_FORWARD) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_POST_ROUTING))) != 0) { pr_info_ratelimited("path-MTU clamping only supported in FORWARD, OUTPUT and POSTROUTING hooks\n"); return -EINVAL; } if (par->nft_compat) return 0; xt_ematch_foreach(ematch, e) if (find_syn_match(ematch)) return 0; pr_info_ratelimited("Only works on TCP SYN packets\n"); return -EINVAL; } #endif static struct xt_target tcpmss_tg_reg[] __read_mostly = { { .family = NFPROTO_IPV4, .name = "TCPMSS", .checkentry = tcpmss_tg4_check, .target = tcpmss_tg4, .targetsize = sizeof(struct xt_tcpmss_info), .proto = IPPROTO_TCP, .me = THIS_MODULE, }, #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) { .family = NFPROTO_IPV6, .name = "TCPMSS", .checkentry = tcpmss_tg6_check, .target = tcpmss_tg6, .targetsize = sizeof(struct xt_tcpmss_info), .proto = IPPROTO_TCP, .me = THIS_MODULE, }, #endif }; static int __init tcpmss_tg_init(void) { return xt_register_targets(tcpmss_tg_reg, ARRAY_SIZE(tcpmss_tg_reg)); } static void __exit tcpmss_tg_exit(void) { xt_unregister_targets(tcpmss_tg_reg, ARRAY_SIZE(tcpmss_tg_reg)); } module_init(tcpmss_tg_init); module_exit(tcpmss_tg_exit); |
8 4 6 12 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 | /* * linux/fs/nls/nls_cp737.c * * Charset cp737 translation tables. * Generated automatically from the Unicode and charset * tables from the Unicode Organization (www.unicode.org). * The Unicode to charset table has only exact mappings. */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/nls.h> #include <linux/errno.h> static const wchar_t charset2uni[256] = { /* 0x00*/ 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, /* 0x10*/ 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, /* 0x20*/ 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, /* 0x30*/ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, /* 0x40*/ 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, /* 0x50*/ 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, /* 0x60*/ 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, /* 0x70*/ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f, /* 0x80*/ 0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396, 0x0397, 0x0398, 0x0399, 0x039a, 0x039b, 0x039c, 0x039d, 0x039e, 0x039f, 0x03a0, /* 0x90*/ 0x03a1, 0x03a3, 0x03a4, 0x03a5, 0x03a6, 0x03a7, 0x03a8, 0x03a9, 0x03b1, 0x03b2, 0x03b3, 0x03b4, 0x03b5, 0x03b6, 0x03b7, 0x03b8, /* 0xa0*/ 0x03b9, 0x03ba, 0x03bb, 0x03bc, 0x03bd, 0x03be, 0x03bf, 0x03c0, 0x03c1, 0x03c3, 0x03c2, 0x03c4, 0x03c5, 0x03c6, 0x03c7, 0x03c8, /* 0xb0*/ 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, 0x2562, 0x2556, 0x2555, 0x2563, 0x2551, 0x2557, 0x255d, 0x255c, 0x255b, 0x2510, /* 0xc0*/ 0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x255e, 0x255f, 0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x2567, /* 0xd0*/ 0x2568, 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256b, 0x256a, 0x2518, 0x250c, 0x2588, 0x2584, 0x258c, 0x2590, 0x2580, /* 0xe0*/ 0x03c9, 0x03ac, 0x03ad, 0x03ae, 0x03ca, 0x03af, 0x03cc, 0x03cd, 0x03cb, 0x03ce, 0x0386, 0x0388, 0x0389, 0x038a, 0x038c, 0x038e, /* 0xf0*/ 0x038f, 0x00b1, 0x2265, 0x2264, 0x03aa, 0x03ab, 0x00f7, 0x2248, 0x00b0, 0x2219, 0x00b7, 0x221a, 0x207f, 0x00b2, 0x25a0, 0x00a0, }; static const unsigned char page00[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0xf8, 0xf1, 0xfd, 0x00, 0x00, 0x00, 0x00, 0xfa, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf6, /* 0xf0-0xf7 */ }; static const unsigned char page03[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xea, 0x00, /* 0x80-0x87 */ 0xeb, 0xec, 0xed, 0x00, 0xee, 0x00, 0xef, 0xf0, /* 0x88-0x8f */ 0x00, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, /* 0x90-0x97 */ 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, /* 0x98-0x9f */ 0x8f, 0x90, 0x00, 0x91, 0x92, 0x93, 0x94, 0x95, /* 0xa0-0xa7 */ 0x96, 0x97, 0xf4, 0xf5, 0xe1, 0xe2, 0xe3, 0xe5, /* 0xa8-0xaf */ 0x00, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, /* 0xb0-0xb7 */ 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, /* 0xb8-0xbf */ 0xa7, 0xa8, 0xaa, 0xa9, 0xab, 0xac, 0xad, 0xae, /* 0xc0-0xc7 */ 0xaf, 0xe0, 0xe4, 0xe8, 0xe6, 0xe7, 0xe9, 0x00, /* 0xc8-0xcf */ }; static const unsigned char page20[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, /* 0x78-0x7f */ }; static const unsigned char page22[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0xf9, 0xfb, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0xf7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0xf3, 0xf2, 0x00, 0x00, /* 0x60-0x67 */ }; static const unsigned char page25[256] = { 0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0xcd, 0xba, 0xd5, 0xd6, 0xc9, 0xb8, 0xb7, 0xbb, /* 0x50-0x57 */ 0xd4, 0xd3, 0xc8, 0xbe, 0xbd, 0xbc, 0xc6, 0xc7, /* 0x58-0x5f */ 0xcc, 0xb5, 0xb6, 0xb9, 0xd1, 0xd2, 0xcb, 0xcf, /* 0x60-0x67 */ 0xd0, 0xca, 0xd8, 0xd7, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0xdb, 0x00, 0x00, 0x00, 0xdd, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0xde, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ }; static const unsigned char *const page_uni2charset[256] = { page00, NULL, NULL, page03, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, page20, NULL, page22, NULL, NULL, page25, NULL, NULL, }; static const unsigned char charset2lower[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x80-0x87 */ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0x88-0x8f */ 0xa8, 0xa9, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xe0, /* 0x90-0x97 */ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ 0xe8, 0xe9, 0xe1, 0xe2, 0xe3, 0xe5, 0xe6, 0xe7, /* 0xe8-0xef */ 0xe9, 0xf1, 0xf2, 0xf3, 0xe4, 0xe8, 0xf6, 0xf7, /* 0xf0-0xf7 */ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ }; static const unsigned char charset2upper[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x98-0x9f */ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0xa0-0xa7 */ 0x90, 0x91, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, /* 0xa8-0xaf */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ 0x97, 0xea, 0xeb, 0xec, 0xf4, 0xed, 0xee, 0xef, /* 0xe0-0xe7 */ 0xf5, 0xf0, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ }; static int uni2char(wchar_t uni, unsigned char *out, int boundlen) { const unsigned char *uni2charset; unsigned char cl = uni & 0x00ff; unsigned char ch = (uni & 0xff00) >> 8; if (boundlen <= 0) return -ENAMETOOLONG; uni2charset = page_uni2charset[ch]; if (uni2charset && uni2charset[cl]) out[0] = uni2charset[cl]; else return -EINVAL; return 1; } static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) { *uni = charset2uni[*rawstring]; if (*uni == 0x0000) return -EINVAL; return 1; } static struct nls_table table = { .charset = "cp737", .uni2char = uni2char, .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, }; static int __init init_nls_cp737(void) { return register_nls(&table); } static void __exit exit_nls_cp737(void) { unregister_nls(&table); } module_init(init_nls_cp737) module_exit(exit_nls_cp737) MODULE_LICENSE("Dual BSD/GPL"); |
1255 1257 1167 1166 8 1166 1165 1170 1170 1173 1168 1167 1166 1166 371 371 1149 1149 1149 1148 1149 179 179 179 433 433 1145 1145 1145 1143 1144 406 168 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 | // SPDX-License-Identifier: GPL-2.0 /* * Interface between ext4 and JBD */ #include "ext4_jbd2.h" #include <trace/events/ext4.h> int ext4_inode_journal_mode(struct inode *inode) { if (EXT4_JOURNAL(inode) == NULL) return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ /* We do not support data journalling with delayed allocation */ if (!S_ISREG(inode->i_mode) || ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) || test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) && !test_opt(inode->i_sb, DELALLOC))) { /* We do not support data journalling for encrypted data */ if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode)) return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */ } if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ BUG(); } /* Just increment the non-pointer handle value */ static handle_t *ext4_get_nojournal(void) { handle_t *handle = current->journal_info; unsigned long ref_cnt = (unsigned long)handle; BUG_ON(ref_cnt >= EXT4_NOJOURNAL_MAX_REF_COUNT); ref_cnt++; handle = (handle_t *)ref_cnt; current->journal_info = handle; return handle; } /* Decrement the non-pointer handle value */ static void ext4_put_nojournal(handle_t *handle) { unsigned long ref_cnt = (unsigned long)handle; BUG_ON(ref_cnt == 0); ref_cnt--; handle = (handle_t *)ref_cnt; current->journal_info = handle; } /* * Wrappers for jbd2_journal_start/end. */ static int ext4_journal_check_start(struct super_block *sb) { journal_t *journal; might_sleep(); if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) return -EIO; if (sb_rdonly(sb)) return -EROFS; WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE); journal = EXT4_SB(sb)->s_journal; /* * Special case here: if the journal has aborted behind our * backs (eg. EIO in the commit thread), then we still need to * take the FS itself readonly cleanly. */ if (journal && is_journal_aborted(journal)) { ext4_abort(sb, -journal->j_errno, "Detected aborted journal"); return -EROFS; } return 0; } handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line, int type, int blocks, int rsv_blocks, int revoke_creds) { journal_t *journal; int err; trace_ext4_journal_start(sb, blocks, rsv_blocks, revoke_creds, _RET_IP_); err = ext4_journal_check_start(sb); if (err < 0) return ERR_PTR(err); journal = EXT4_SB(sb)->s_journal; if (!journal || (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)) return ext4_get_nojournal(); return jbd2__journal_start(journal, blocks, rsv_blocks, revoke_creds, GFP_NOFS, type, line); } int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle) { struct super_block *sb; int err; int rc; if (!ext4_handle_valid(handle)) { ext4_put_nojournal(handle); return 0; } err = handle->h_err; if (!handle->h_transaction) { rc = jbd2_journal_stop(handle); return err ? err : rc; } sb = handle->h_transaction->t_journal->j_private; rc = jbd2_journal_stop(handle); if (!err) err = rc; if (err) __ext4_std_error(sb, where, line, err); return err; } handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line, int type) { struct super_block *sb; int err; if (!ext4_handle_valid(handle)) return ext4_get_nojournal(); sb = handle->h_journal->j_private; trace_ext4_journal_start_reserved(sb, jbd2_handle_buffer_credits(handle), _RET_IP_); err = ext4_journal_check_start(sb); if (err < 0) { jbd2_journal_free_reserved(handle); return ERR_PTR(err); } err = jbd2_journal_start_reserved(handle, type, line); if (err < 0) return ERR_PTR(err); return handle; } int __ext4_journal_ensure_credits(handle_t *handle, int check_cred, int extend_cred, int revoke_cred) { if (!ext4_handle_valid(handle)) return 0; if (is_handle_aborted(handle)) return -EROFS; if (jbd2_handle_buffer_credits(handle) >= check_cred && handle->h_revoke_credits >= revoke_cred) return 0; extend_cred = max(0, extend_cred - jbd2_handle_buffer_credits(handle)); revoke_cred = max(0, revoke_cred - handle->h_revoke_credits); return ext4_journal_extend(handle, extend_cred, revoke_cred); } static void ext4_journal_abort_handle(const char *caller, unsigned int line, const char *err_fn, struct buffer_head *bh, handle_t *handle, int err) { char nbuf[16]; const char *errstr = ext4_decode_error(NULL, err, nbuf); BUG_ON(!ext4_handle_valid(handle)); if (bh) BUFFER_TRACE(bh, "abort"); if (!handle->h_err) handle->h_err = err; if (is_handle_aborted(handle)) return; printk(KERN_ERR "EXT4-fs: %s:%d: aborting transaction: %s in %s\n", caller, line, errstr, err_fn); jbd2_journal_abort_handle(handle); } static void ext4_check_bdev_write_error(struct super_block *sb) { struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping; struct ext4_sb_info *sbi = EXT4_SB(sb); int err; /* * If the block device has write error flag, it may have failed to * async write out metadata buffers in the background. In this case, * we could read old data from disk and write it out again, which * may lead to on-disk filesystem inconsistency. */ if (errseq_check(&mapping->wb_err, READ_ONCE(sbi->s_bdev_wb_err))) { spin_lock(&sbi->s_bdev_wb_lock); err = errseq_check_and_advance(&mapping->wb_err, &sbi->s_bdev_wb_err); spin_unlock(&sbi->s_bdev_wb_lock); if (err) ext4_error_err(sb, -err, "Error while async write back metadata"); } } int __ext4_journal_get_write_access(const char *where, unsigned int line, handle_t *handle, struct super_block *sb, struct buffer_head *bh, enum ext4_journal_trigger_type trigger_type) { int err; might_sleep(); if (bh->b_bdev->bd_super) ext4_check_bdev_write_error(bh->b_bdev->bd_super); if (ext4_handle_valid(handle)) { err = jbd2_journal_get_write_access(handle, bh); if (err) { ext4_journal_abort_handle(where, line, __func__, bh, handle, err); return err; } } if (trigger_type == EXT4_JTR_NONE || !ext4_has_metadata_csum(sb)) return 0; BUG_ON(trigger_type >= EXT4_JOURNAL_TRIGGER_COUNT); jbd2_journal_set_triggers(bh, &EXT4_SB(sb)->s_journal_triggers[trigger_type].tr_triggers); return 0; } /* * The ext4 forget function must perform a revoke if we are freeing data * which has been journaled. Metadata (eg. indirect blocks) must be * revoked in all cases. * * "bh" may be NULL: a metadata block may have been freed from memory * but there may still be a record of it in the journal, and that record * still needs to be revoked. */ int __ext4_forget(const char *where, unsigned int line, handle_t *handle, int is_metadata, struct inode *inode, struct buffer_head *bh, ext4_fsblk_t blocknr) { int err; might_sleep(); trace_ext4_forget(inode, is_metadata, blocknr); BUFFER_TRACE(bh, "enter"); ext4_debug("forgetting bh %p: is_metadata=%d, mode %o, data mode %x\n", bh, is_metadata, inode->i_mode, test_opt(inode->i_sb, DATA_FLAGS)); /* In the no journal case, we can just do a bforget and return */ if (!ext4_handle_valid(handle)) { bforget(bh); return 0; } /* Never use the revoke function if we are doing full data * journaling: there is no need to, and a V1 superblock won't * support it. Otherwise, only skip the revoke on un-journaled * data blocks. */ if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || (!is_metadata && !ext4_should_journal_data(inode))) { if (bh) { BUFFER_TRACE(bh, "call jbd2_journal_forget"); err = jbd2_journal_forget(handle, bh); if (err) ext4_journal_abort_handle(where, line, __func__, bh, handle, err); return err; } return 0; } /* * data!=journal && (is_metadata || should_journal_data(inode)) */ BUFFER_TRACE(bh, "call jbd2_journal_revoke"); err = jbd2_journal_revoke(handle, blocknr, bh); if (err) { ext4_journal_abort_handle(where, line, __func__, bh, handle, err); __ext4_error(inode->i_sb, where, line, true, -err, 0, "error %d when attempting revoke", err); } BUFFER_TRACE(bh, "exit"); return err; } int __ext4_journal_get_create_access(const char *where, unsigned int line, handle_t *handle, struct super_block *sb, struct buffer_head *bh, enum ext4_journal_trigger_type trigger_type) { int err; if (!ext4_handle_valid(handle)) return 0; err = jbd2_journal_get_create_access(handle, bh); if (err) { ext4_journal_abort_handle(where, line, __func__, bh, handle, err); return err; } if (trigger_type == EXT4_JTR_NONE || !ext4_has_metadata_csum(sb)) return 0; BUG_ON(trigger_type >= EXT4_JOURNAL_TRIGGER_COUNT); jbd2_journal_set_triggers(bh, &EXT4_SB(sb)->s_journal_triggers[trigger_type].tr_triggers); return 0; } int __ext4_handle_dirty_metadata(const char *where, unsigned int line, handle_t *handle, struct inode *inode, struct buffer_head *bh) { int err = 0; might_sleep(); set_buffer_meta(bh); set_buffer_prio(bh); set_buffer_uptodate(bh); if (ext4_handle_valid(handle)) { err = jbd2_journal_dirty_metadata(handle, bh); /* Errors can only happen due to aborted journal or a nasty bug */ if (!is_handle_aborted(handle) && WARN_ON_ONCE(err)) { ext4_journal_abort_handle(where, line, __func__, bh, handle, err); if (inode == NULL) { pr_err("EXT4: jbd2_journal_dirty_metadata " "failed: handle type %u started at " "line %u, credits %u/%u, errcode %d", handle->h_type, handle->h_line_no, handle->h_requested_credits, jbd2_handle_buffer_credits(handle), err); return err; } ext4_error_inode(inode, where, line, bh->b_blocknr, "journal_dirty_metadata failed: " "handle type %u started at line %u, " "credits %u/%u, errcode %d", handle->h_type, handle->h_line_no, handle->h_requested_credits, jbd2_handle_buffer_credits(handle), err); } } else { if (inode) mark_buffer_dirty_inode(bh, inode); else mark_buffer_dirty(bh); if (inode && inode_needs_sync(inode)) { sync_dirty_buffer(bh); if (buffer_req(bh) && !buffer_uptodate(bh)) { ext4_error_inode_err(inode, where, line, bh->b_blocknr, EIO, "IO error syncing itable block"); err = -EIO; } } } return err; } |
3 3 3 1 139 1 1 3 3 3 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 139 139 138 1 2 1 22 22 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 | // SPDX-License-Identifier: GPL-2.0-or-later /* * USB Audio Driver for ALSA * * Quirks and vendor-specific extensions for mixer interfaces * * Copyright (c) 2002 by Takashi Iwai <tiwai@suse.de> * * Many codes borrowed from audio.c by * Alan Cox (alan@lxorguk.ukuu.org.uk) * Thomas Sailer (sailer@ife.ee.ethz.ch) * * Audio Advantage Micro II support added by: * Przemek Rudy (prudy1@o2.pl) */ #include <linux/hid.h> #include <linux/init.h> #include <linux/math64.h> #include <linux/slab.h> #include <linux/usb.h> #include <linux/usb/audio.h> #include <sound/asoundef.h> #include <sound/core.h> #include <sound/control.h> #include <sound/hda_verbs.h> #include <sound/hwdep.h> #include <sound/info.h> #include <sound/tlv.h> #include "usbaudio.h" #include "mixer.h" #include "mixer_quirks.h" #include "mixer_scarlett.h" #include "mixer_scarlett2.h" #include "mixer_us16x08.h" #include "mixer_s1810c.h" #include "helper.h" struct std_mono_table { unsigned int unitid, control, cmask; int val_type; const char *name; snd_kcontrol_tlv_rw_t *tlv_callback; }; /* This function allows for the creation of standard UAC controls. * See the quirks for M-Audio FTUs or Ebox-44. * If you don't want to set a TLV callback pass NULL. * * Since there doesn't seem to be a devices that needs a multichannel * version, we keep it mono for simplicity. */ static int snd_create_std_mono_ctl_offset(struct usb_mixer_interface *mixer, unsigned int unitid, unsigned int control, unsigned int cmask, int val_type, unsigned int idx_off, const char *name, snd_kcontrol_tlv_rw_t *tlv_callback) { struct usb_mixer_elem_info *cval; struct snd_kcontrol *kctl; cval = kzalloc(sizeof(*cval), GFP_KERNEL); if (!cval) return -ENOMEM; snd_usb_mixer_elem_init_std(&cval->head, mixer, unitid); cval->val_type = val_type; cval->channels = 1; cval->control = control; cval->cmask = cmask; cval->idx_off = idx_off; /* get_min_max() is called only for integer volumes later, * so provide a short-cut for booleans */ cval->min = 0; cval->max = 1; cval->res = 0; cval->dBmin = 0; cval->dBmax = 0; /* Create control */ kctl = snd_ctl_new1(snd_usb_feature_unit_ctl, cval); if (!kctl) { kfree(cval); return -ENOMEM; } /* Set name */ snprintf(kctl->id.name, sizeof(kctl->id.name), name); kctl->private_free = snd_usb_mixer_elem_free; /* set TLV */ if (tlv_callback) { kctl->tlv.c = tlv_callback; kctl->vd[0].access |= SNDRV_CTL_ELEM_ACCESS_TLV_READ | SNDRV_CTL_ELEM_ACCESS_TLV_CALLBACK; } /* Add control to mixer */ return snd_usb_mixer_add_control(&cval->head, kctl); } static int snd_create_std_mono_ctl(struct usb_mixer_interface *mixer, unsigned int unitid, unsigned int control, unsigned int cmask, int val_type, const char *name, snd_kcontrol_tlv_rw_t *tlv_callback) { return snd_create_std_mono_ctl_offset(mixer, unitid, control, cmask, val_type, 0 /* Offset */, name, tlv_callback); } /* * Create a set of standard UAC controls from a table */ static int snd_create_std_mono_table(struct usb_mixer_interface *mixer, const struct std_mono_table *t) { int err; while (t->name != NULL) { err = snd_create_std_mono_ctl(mixer, t->unitid, t->control, t->cmask, t->val_type, t->name, t->tlv_callback); if (err < 0) return err; t++; } return 0; } static int add_single_ctl_with_resume(struct usb_mixer_interface *mixer, int id, usb_mixer_elem_resume_func_t resume, const struct snd_kcontrol_new *knew, struct usb_mixer_elem_list **listp) { struct usb_mixer_elem_list *list; struct snd_kcontrol *kctl; list = kzalloc(sizeof(*list), GFP_KERNEL); if (!list) return -ENOMEM; if (listp) *listp = list; list->mixer = mixer; list->id = id; list->resume = resume; kctl = snd_ctl_new1(knew, list); if (!kctl) { kfree(list); return -ENOMEM; } kctl->private_free = snd_usb_mixer_elem_free; /* don't use snd_usb_mixer_add_control() here, this is a special list element */ return snd_usb_mixer_add_list(list, kctl, false); } /* * Sound Blaster remote control configuration * * format of remote control data: * Extigy: xx 00 * Audigy 2 NX: 06 80 xx 00 00 00 * Live! 24-bit: 06 80 xx yy 22 83 */ static const struct rc_config { u32 usb_id; u8 offset; u8 length; u8 packet_length; u8 min_packet_length; /* minimum accepted length of the URB result */ u8 mute_mixer_id; u32 mute_code; } rc_configs[] = { { USB_ID(0x041e, 0x3000), 0, 1, 2, 1, 18, 0x0013 }, /* Extigy */ { USB_ID(0x041e, 0x3020), 2, 1, 6, 6, 18, 0x0013 }, /* Audigy 2 NX */ { USB_ID(0x041e, 0x3040), 2, 2, 6, 6, 2, 0x6e91 }, /* Live! 24-bit */ { USB_ID(0x041e, 0x3042), 0, 1, 1, 1, 1, 0x000d }, /* Usb X-Fi S51 */ { USB_ID(0x041e, 0x30df), 0, 1, 1, 1, 1, 0x000d }, /* Usb X-Fi S51 Pro */ { USB_ID(0x041e, 0x3237), 0, 1, 1, 1, 1, 0x000d }, /* Usb X-Fi S51 Pro */ { USB_ID(0x041e, 0x3263), 0, 1, 1, 1, 1, 0x000d }, /* Usb X-Fi S51 Pro */ { USB_ID(0x041e, 0x3048), 2, 2, 6, 6, 2, 0x6e91 }, /* Toshiba SB0500 */ }; static void snd_usb_soundblaster_remote_complete(struct urb *urb) { struct usb_mixer_interface *mixer = urb->context; const struct rc_config *rc = mixer->rc_cfg; u32 code; if (urb->status < 0 || urb->actual_length < rc->min_packet_length) return; code = mixer->rc_buffer[rc->offset]; if (rc->length == 2) code |= mixer->rc_buffer[rc->offset + 1] << 8; /* the Mute button actually changes the mixer control */ if (code == rc->mute_code) snd_usb_mixer_notify_id(mixer, rc->mute_mixer_id); mixer->rc_code = code; wmb(); wake_up(&mixer->rc_waitq); } static long snd_usb_sbrc_hwdep_read(struct snd_hwdep *hw, char __user *buf, long count, loff_t *offset) { struct usb_mixer_interface *mixer = hw->private_data; int err; u32 rc_code; if (count != 1 && count != 4) return -EINVAL; err = wait_event_interruptible(mixer->rc_waitq, (rc_code = xchg(&mixer->rc_code, 0)) != 0); if (err == 0) { if (count == 1) err = put_user(rc_code, buf); else err = put_user(rc_code, (u32 __user *)buf); } return err < 0 ? err : count; } static __poll_t snd_usb_sbrc_hwdep_poll(struct snd_hwdep *hw, struct file *file, poll_table *wait) { struct usb_mixer_interface *mixer = hw->private_data; poll_wait(file, &mixer->rc_waitq, wait); return mixer->rc_code ? EPOLLIN | EPOLLRDNORM : 0; } static int snd_usb_soundblaster_remote_init(struct usb_mixer_interface *mixer) { struct snd_hwdep *hwdep; int err, len, i; for (i = 0; i < ARRAY_SIZE(rc_configs); ++i) if (rc_configs[i].usb_id == mixer->chip->usb_id) break; if (i >= ARRAY_SIZE(rc_configs)) return 0; mixer->rc_cfg = &rc_configs[i]; len = mixer->rc_cfg->packet_length; init_waitqueue_head(&mixer->rc_waitq); err = snd_hwdep_new(mixer->chip->card, "SB remote control", 0, &hwdep); if (err < 0) return err; snprintf(hwdep->name, sizeof(hwdep->name), "%s remote control", mixer->chip->card->shortname); hwdep->iface = SNDRV_HWDEP_IFACE_SB_RC; hwdep->private_data = mixer; hwdep->ops.read = snd_usb_sbrc_hwdep_read; hwdep->ops.poll = snd_usb_sbrc_hwdep_poll; hwdep->exclusive = 1; mixer->rc_urb = usb_alloc_urb(0, GFP_KERNEL); if (!mixer->rc_urb) return -ENOMEM; mixer->rc_setup_packet = kmalloc(sizeof(*mixer->rc_setup_packet), GFP_KERNEL); if (!mixer->rc_setup_packet) { usb_free_urb(mixer->rc_urb); mixer->rc_urb = NULL; return -ENOMEM; } mixer->rc_setup_packet->bRequestType = USB_DIR_IN | USB_TYPE_CLASS | USB_RECIP_INTERFACE; mixer->rc_setup_packet->bRequest = UAC_GET_MEM; mixer->rc_setup_packet->wValue = cpu_to_le16(0); mixer->rc_setup_packet->wIndex = cpu_to_le16(0); mixer->rc_setup_packet->wLength = cpu_to_le16(len); usb_fill_control_urb(mixer->rc_urb, mixer->chip->dev, usb_rcvctrlpipe(mixer->chip->dev, 0), (u8*)mixer->rc_setup_packet, mixer->rc_buffer, len, snd_usb_soundblaster_remote_complete, mixer); return 0; } #define snd_audigy2nx_led_info snd_ctl_boolean_mono_info static int snd_audigy2nx_led_get(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { ucontrol->value.integer.value[0] = kcontrol->private_value >> 8; return 0; } static int snd_audigy2nx_led_update(struct usb_mixer_interface *mixer, int value, int index) { struct snd_usb_audio *chip = mixer->chip; int err; err = snd_usb_lock_shutdown(chip); if (err < 0) return err; if (chip->usb_id == USB_ID(0x041e, 0x3042)) err = snd_usb_ctl_msg(chip->dev, usb_sndctrlpipe(chip->dev, 0), 0x24, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_OTHER, !value, 0, NULL, 0); /* USB X-Fi S51 Pro */ if (chip->usb_id == USB_ID(0x041e, 0x30df)) err = snd_usb_ctl_msg(chip->dev, usb_sndctrlpipe(chip->dev, 0), 0x24, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_OTHER, !value, 0, NULL, 0); else err = snd_usb_ctl_msg(chip->dev, usb_sndctrlpipe(chip->dev, 0), 0x24, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_OTHER, value, index + 2, NULL, 0); snd_usb_unlock_shutdown(chip); return err; } static int snd_audigy2nx_led_put(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { struct usb_mixer_elem_list *list = snd_kcontrol_chip(kcontrol); struct usb_mixer_interface *mixer = list->mixer; int index = kcontrol->private_value & 0xff; unsigned int value = ucontrol->value.integer.value[0]; int old_value = kcontrol->private_value >> 8; int err; if (value > 1) return -EINVAL; if (value == old_value) return 0; kcontrol->private_value = (value << 8) | index; err = snd_audigy2nx_led_update(mixer, value, index); return err < 0 ? err : 1; } static int snd_audigy2nx_led_resume(struct usb_mixer_elem_list *list) { int priv_value = list->kctl->private_value; return snd_audigy2nx_led_update(list->mixer, priv_value >> 8, priv_value & 0xff); } /* name and private_value are set dynamically */ static const struct snd_kcontrol_new snd_audigy2nx_control = { .iface = SNDRV_CTL_ELEM_IFACE_MIXER, .info = snd_audigy2nx_led_info, .get = snd_audigy2nx_led_get, .put = snd_audigy2nx_led_put, }; static const char * const snd_audigy2nx_led_names[] = { "CMSS LED Switch", "Power LED Switch", "Dolby Digital LED Switch", }; static int snd_audigy2nx_controls_create(struct usb_mixer_interface *mixer) { int i, err; for (i = 0; i < ARRAY_SIZE(snd_audigy2nx_led_names); ++i) { struct snd_kcontrol_new knew; /* USB X-Fi S51 doesn't have a CMSS LED */ if ((mixer->chip->usb_id == USB_ID(0x041e, 0x3042)) && i == 0) continue; /* USB X-Fi S51 Pro doesn't have one either */ if ((mixer->chip->usb_id == USB_ID(0x041e, 0x30df)) && i == 0) continue; if (i > 1 && /* Live24ext has 2 LEDs only */ (mixer->chip->usb_id == USB_ID(0x041e, 0x3040) || mixer->chip->usb_id == USB_ID(0x041e, 0x3042) || mixer->chip->usb_id == USB_ID(0x041e, 0x30df) || mixer->chip->usb_id == USB_ID(0x041e, 0x3048))) break; knew = snd_audigy2nx_control; knew.name = snd_audigy2nx_led_names[i]; knew.private_value = (1 << 8) | i; /* LED on as default */ err = add_single_ctl_with_resume(mixer, 0, snd_audigy2nx_led_resume, &knew, NULL); if (err < 0) return err; } return 0; } static void snd_audigy2nx_proc_read(struct snd_info_entry *entry, struct snd_info_buffer *buffer) { static const struct sb_jack { int unitid; const char *name; } jacks_audigy2nx[] = { {4, "dig in "}, {7, "line in"}, {19, "spk out"}, {20, "hph out"}, {-1, NULL} }, jacks_live24ext[] = { {4, "line in"}, /* &1=Line, &2=Mic*/ {3, "hph out"}, /* headphones */ {0, "RC "}, /* last command, 6 bytes see rc_config above */ {-1, NULL} }; const struct sb_jack *jacks; struct usb_mixer_interface *mixer = entry->private_data; int i, err; u8 buf[3]; snd_iprintf(buffer, "%s jacks\n\n", mixer->chip->card->shortname); if (mixer->chip->usb_id == USB_ID(0x041e, 0x3020)) jacks = jacks_audigy2nx; else if (mixer->chip->usb_id == USB_ID(0x041e, 0x3040) || mixer->chip->usb_id == USB_ID(0x041e, 0x3048)) jacks = jacks_live24ext; else return; for (i = 0; jacks[i].name; ++i) { snd_iprintf(buffer, "%s: ", jacks[i].name); err = snd_usb_lock_shutdown(mixer->chip); if (err < 0) return; err = snd_usb_ctl_msg(mixer->chip->dev, usb_rcvctrlpipe(mixer->chip->dev, 0), UAC_GET_MEM, USB_DIR_IN | USB_TYPE_CLASS | USB_RECIP_INTERFACE, 0, jacks[i].unitid << 8, buf, 3); snd_usb_unlock_shutdown(mixer->chip); if (err == 3 && (buf[0] == 3 || buf[0] == 6)) snd_iprintf(buffer, "%02x %02x\n", buf[1], buf[2]); else snd_iprintf(buffer, "?\n"); } } /* EMU0204 */ static int snd_emu0204_ch_switch_info(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_info *uinfo) { static const char * const texts[2] = {"1/2", "3/4"}; return snd_ctl_enum_info(uinfo, 1, ARRAY_SIZE(texts), texts); } static int snd_emu0204_ch_switch_get(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { ucontrol->value.enumerated.item[0] = kcontrol->private_value; return 0; } static int snd_emu0204_ch_switch_update(struct usb_mixer_interface *mixer, int value) { struct snd_usb_audio *chip = mixer->chip; int err; unsigned char buf[2]; err = snd_usb_lock_shutdown(chip); if (err < 0) return err; buf[0] = 0x01; buf[1] = value ? 0x02 : 0x01; err = snd_usb_ctl_msg(chip->dev, usb_sndctrlpipe(chip->dev, 0), UAC_SET_CUR, USB_RECIP_INTERFACE | USB_TYPE_CLASS | USB_DIR_OUT, 0x0400, 0x0e00, buf, 2); snd_usb_unlock_shutdown(chip); return err; } static int snd_emu0204_ch_switch_put(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { struct usb_mixer_elem_list *list = snd_kcontrol_chip(kcontrol); struct usb_mixer_interface *mixer = list->mixer; unsigned int value = ucontrol->value.enumerated.item[0]; int err; if (value > 1) return -EINVAL; if (value == kcontrol->private_value) return 0; kcontrol->private_value = value; err = snd_emu0204_ch_switch_update(mixer, value); return err < 0 ? err : 1; } static int snd_emu0204_ch_switch_resume(struct usb_mixer_elem_list *list) { return snd_emu0204_ch_switch_update(list->mixer, list->kctl->private_value); } static const struct snd_kcontrol_new snd_emu0204_control = { .iface = SNDRV_CTL_ELEM_IFACE_MIXER, .name = "Front Jack Channels", .info = snd_emu0204_ch_switch_info, .get = snd_emu0204_ch_switch_get, .put = snd_emu0204_ch_switch_put, .private_value = 0, }; static int snd_emu0204_controls_create(struct usb_mixer_interface *mixer) { return add_single_ctl_with_resume(mixer, 0, snd_emu0204_ch_switch_resume, &snd_emu0204_control, NULL); } /* ASUS Xonar U1 / U3 controls */ static int snd_xonar_u1_switch_get(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { ucontrol->value.integer.value[0] = !!(kcontrol->private_value & 0x02); return 0; } static int snd_xonar_u1_switch_update(struct usb_mixer_interface *mixer, unsigned char status) { struct snd_usb_audio *chip = mixer->chip; int err; err = snd_usb_lock_shutdown(chip); if (err < 0) return err; err = snd_usb_ctl_msg(chip->dev, usb_sndctrlpipe(chip->dev, 0), 0x08, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_OTHER, 50, 0, &status, 1); snd_usb_unlock_shutdown(chip); return err; } static int snd_xonar_u1_switch_put(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { struct usb_mixer_elem_list *list = snd_kcontrol_chip(kcontrol); u8 old_status, new_status; int err; old_status = kcontrol->private_value; if (ucontrol->value.integer.value[0]) new_status = old_status | 0x02; else new_status = old_status & ~0x02; if (new_status == old_status) return 0; kcontrol->private_value = new_status; err = snd_xonar_u1_switch_update(list->mixer, new_status); return err < 0 ? err : 1; } static int snd_xonar_u1_switch_resume(struct usb_mixer_elem_list *list) { return snd_xonar_u1_switch_update(list->mixer, list->kctl->private_value); } static const struct snd_kcontrol_new snd_xonar_u1_output_switch = { .iface = SNDRV_CTL_ELEM_IFACE_MIXER, .name = "Digital Playback Switch", .info = snd_ctl_boolean_mono_info, .get = snd_xonar_u1_switch_get, .put = snd_xonar_u1_switch_put, .private_value = 0x05, }; static int snd_xonar_u1_controls_create(struct usb_mixer_interface *mixer) { return add_single_ctl_with_resume(mixer, 0, snd_xonar_u1_switch_resume, &snd_xonar_u1_output_switch, NULL); } /* Digidesign Mbox 1 helper functions */ static int snd_mbox1_is_spdif_synced(struct snd_usb_audio *chip) { unsigned char buff[3]; int err; int is_spdif_synced; /* Read clock source */ err = snd_usb_ctl_msg(chip->dev, usb_rcvctrlpipe(chip->dev, 0), 0x81, USB_DIR_IN | USB_TYPE_CLASS | USB_RECIP_ENDPOINT, 0x100, 0x81, buff, 3); if (err < 0) return err; /* spdif sync: buff is all zeroes */ is_spdif_synced = !(buff[0] | buff[1] | buff[2]); return is_spdif_synced; } static int snd_mbox1_set_clk_source(struct snd_usb_audio *chip, int rate_or_zero) { /* 2 possibilities: Internal -> expects sample rate * S/PDIF sync -> expects rate = 0 */ unsigned char buff[3]; buff[0] = (rate_or_zero >> 0) & 0xff; buff[1] = (rate_or_zero >> 8) & 0xff; buff[2] = (rate_or_zero >> 16) & 0xff; /* Set clock source */ return snd_usb_ctl_msg(chip->dev, usb_sndctrlpipe(chip->dev, 0), 0x1, USB_TYPE_CLASS | USB_RECIP_ENDPOINT, 0x100, 0x81, buff, 3); } static int snd_mbox1_is_spdif_input(struct snd_usb_audio *chip) { /* Hardware gives 2 possibilities: ANALOG Source -> 0x01 * S/PDIF Source -> 0x02 */ int err; unsigned char source[1]; /* Read input source */ err = snd_usb_ctl_msg(chip->dev, usb_rcvctrlpipe(chip->dev, 0), 0x81, USB_DIR_IN | USB_TYPE_CLASS | USB_RECIP_INTERFACE, 0x00, 0x500, source, 1); if (err < 0) return err; return (source[0] == 2); } static int snd_mbox1_set_input_source(struct snd_usb_audio *chip, int is_spdif) { /* NB: Setting the input source to S/PDIF resets the clock source to S/PDIF * Hardware expects 2 possibilities: ANALOG Source -> 0x01 * S/PDIF Source -> 0x02 */ unsigned char buff[1]; buff[0] = (is_spdif & 1) + 1; /* Set input source */ return snd_usb_ctl_msg(chip->dev, usb_sndctrlpipe(chip->dev, 0), 0x1, USB_TYPE_CLASS | USB_RECIP_INTERFACE, 0x00, 0x500, buff, 1); } /* Digidesign Mbox 1 clock source switch (internal/spdif) */ static int snd_mbox1_clk_switch_get(struct snd_kcontrol *kctl, struct snd_ctl_elem_value *ucontrol) { struct usb_mixer_elem_list *list = snd_kcontrol_chip(kctl); struct snd_usb_audio *chip = list->mixer->chip; int err; err = snd_usb_lock_shutdown(chip); if (err < 0) goto err; err = snd_mbox1_is_spdif_synced(chip); if (err < 0) goto err; kctl->private_value = err; err = 0; ucontrol->value.enumerated.item[0] = kctl->private_value; err: snd_usb_unlock_shutdown(chip); return err; } static int snd_mbox1_clk_switch_update(struct usb_mixer_interface *mixer, int is_spdif_sync) { struct snd_usb_audio *chip = mixer->chip; int err; err = snd_usb_lock_shutdown(chip); if (err < 0) return err; err = snd_mbox1_is_spdif_input(chip); if (err < 0) goto err; err = snd_mbox1_is_spdif_synced(chip); if (err < 0) goto err; /* FIXME: hardcoded sample rate */ err = snd_mbox1_set_clk_source(chip, is_spdif_sync ? 0 : 48000); if (err < 0) goto err; err = snd_mbox1_is_spdif_synced(chip); err: snd_usb_unlock_shutdown(chip); return err; } static int snd_mbox1_clk_switch_put(struct snd_kcontrol *kctl, struct snd_ctl_elem_value *ucontrol) { struct usb_mixer_elem_list *list = snd_kcontrol_chip(kctl); struct usb_mixer_interface *mixer = list->mixer; int err; bool cur_val, new_val; cur_val = kctl->private_value; new_val = ucontrol->value.enumerated.item[0]; if (cur_val == new_val) return 0; kctl->private_value = new_val; err = snd_mbox1_clk_switch_update(mixer, new_val); return err < 0 ? err : 1; } static int snd_mbox1_clk_switch_info(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_info *uinfo) { static const char *const texts[2] = { "Internal", "S/PDIF" }; return snd_ctl_enum_info(uinfo, 1, ARRAY_SIZE(texts), texts); } static int snd_mbox1_clk_switch_resume(struct usb_mixer_elem_list *list) { return snd_mbox1_clk_switch_update(list->mixer, list->kctl->private_value); } /* Digidesign Mbox 1 input source switch (analog/spdif) */ static int snd_mbox1_src_switch_get(struct snd_kcontrol *kctl, struct snd_ctl_elem_value *ucontrol) { ucontrol->value.enumerated.item[0] = kctl->private_value; return 0; } static int snd_mbox1_src_switch_update(struct usb_mixer_interface *mixer, int is_spdif_input) { struct snd_usb_audio *chip = mixer->chip; int err; err = snd_usb_lock_shutdown(chip); if (err < 0) return err; err = snd_mbox1_is_spdif_input(chip); if (err < 0) goto err; err = snd_mbox1_set_input_source(chip, is_spdif_input); if (err < 0) goto err; err = snd_mbox1_is_spdif_input(chip); if (err < 0) goto err; err = snd_mbox1_is_spdif_synced(chip); err: snd_usb_unlock_shutdown(chip); return err; } static int snd_mbox1_src_switch_put(struct snd_kcontrol *kctl, struct snd_ctl_elem_value *ucontrol) { struct usb_mixer_elem_list *list = snd_kcontrol_chip(kctl); struct usb_mixer_interface *mixer = list->mixer; int err; bool cur_val, new_val; cur_val = kctl->private_value; new_val = ucontrol->value.enumerated.item[0]; if (cur_val == new_val) return 0; kctl->private_value = new_val; err = snd_mbox1_src_switch_update(mixer, new_val); return err < 0 ? err : 1; } static int snd_mbox1_src_switch_info(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_info *uinfo) { static const char *const texts[2] = { "Analog", "S/PDIF" }; return snd_ctl_enum_info(uinfo, 1, ARRAY_SIZE(texts), texts); } static int snd_mbox1_src_switch_resume(struct usb_mixer_elem_list *list) { return snd_mbox1_src_switch_update(list->mixer, list->kctl->private_value); } static const struct snd_kcontrol_new snd_mbox1_clk_switch = { .iface = SNDRV_CTL_ELEM_IFACE_MIXER, .name = "Clock Source", .index = 0, .access = SNDRV_CTL_ELEM_ACCESS_READWRITE, .info = snd_mbox1_clk_switch_info, .get = snd_mbox1_clk_switch_get, .put = snd_mbox1_clk_switch_put, .private_value = 0 }; static const struct snd_kcontrol_new snd_mbox1_src_switch = { .iface = SNDRV_CTL_ELEM_IFACE_MIXER, .name = "Input Source", .index = 1, .access = SNDRV_CTL_ELEM_ACCESS_READWRITE, .info = snd_mbox1_src_switch_info, .get = snd_mbox1_src_switch_get, .put = snd_mbox1_src_switch_put, .private_value = 0 }; static int snd_mbox1_controls_create(struct usb_mixer_interface *mixer) { int err; err = add_single_ctl_with_resume(mixer, 0, snd_mbox1_clk_switch_resume, &snd_mbox1_clk_switch, NULL); if (err < 0) return err; return add_single_ctl_with_resume(mixer, 1, snd_mbox1_src_switch_resume, &snd_mbox1_src_switch, NULL); } /* Native Instruments device quirks */ #define _MAKE_NI_CONTROL(bRequest,wIndex) ((bRequest) << 16 | (wIndex)) static int snd_ni_control_init_val(struct usb_mixer_interface *mixer, struct snd_kcontrol *kctl) { struct usb_device *dev = mixer->chip->dev; unsigned int pval = kctl->private_value; u8 value; int err; err = snd_usb_ctl_msg(dev, usb_rcvctrlpipe(dev, 0), (pval >> 16) & 0xff, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_IN, 0, pval & 0xffff, &value, 1); if (err < 0) { dev_err(&dev->dev, "unable to issue vendor read request (ret = %d)", err); return err; } kctl->private_value |= ((unsigned int)value << 24); return 0; } static int snd_nativeinstruments_control_get(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { ucontrol->value.integer.value[0] = kcontrol->private_value >> 24; return 0; } static int snd_ni_update_cur_val(struct usb_mixer_elem_list *list) { struct snd_usb_audio *chip = list->mixer->chip; unsigned int pval = list->kctl->private_value; int err; err = snd_usb_lock_shutdown(chip); if (err < 0) return err; err = usb_control_msg(chip->dev, usb_sndctrlpipe(chip->dev, 0), (pval >> 16) & 0xff, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_OUT, pval >> 24, pval & 0xffff, NULL, 0, 1000); snd_usb_unlock_shutdown(chip); return err; } static int snd_nativeinstruments_control_put(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { struct usb_mixer_elem_list *list = snd_kcontrol_chip(kcontrol); u8 oldval = (kcontrol->private_value >> 24) & 0xff; u8 newval = ucontrol->value.integer.value[0]; int err; if (oldval == newval) return 0; kcontrol->private_value &= ~(0xff << 24); kcontrol->private_value |= (unsigned int)newval << 24; err = snd_ni_update_cur_val(list); return err < 0 ? err : 1; } static const struct snd_kcontrol_new snd_nativeinstruments_ta6_mixers[] = { { .name = "Direct Thru Channel A", .private_value = _MAKE_NI_CONTROL(0x01, 0x03), }, { .name = "Direct Thru Channel B", .private_value = _MAKE_NI_CONTROL(0x01, 0x05), }, { .name = "Phono Input Channel A", .private_value = _MAKE_NI_CONTROL(0x02, 0x03), }, { .name = "Phono Input Channel B", .private_value = _MAKE_NI_CONTROL(0x02, 0x05), }, }; static const struct snd_kcontrol_new snd_nativeinstruments_ta10_mixers[] = { { .name = "Direct Thru Channel A", .private_value = _MAKE_NI_CONTROL(0x01, 0x03), }, { .name = "Direct Thru Channel B", .private_value = _MAKE_NI_CONTROL(0x01, 0x05), }, { .name = "Direct Thru Channel C", .private_value = _MAKE_NI_CONTROL(0x01, 0x07), }, { .name = "Direct Thru Channel D", .private_value = _MAKE_NI_CONTROL(0x01, 0x09), }, { .name = "Phono Input Channel A", .private_value = _MAKE_NI_CONTROL(0x02, 0x03), }, { .name = "Phono Input Channel B", .private_value = _MAKE_NI_CONTROL(0x02, 0x05), }, { .name = "Phono Input Channel C", .private_value = _MAKE_NI_CONTROL(0x02, 0x07), }, { .name = "Phono Input Channel D", .private_value = _MAKE_NI_CONTROL(0x02, 0x09), }, }; static int snd_nativeinstruments_create_mixer(struct usb_mixer_interface *mixer, const struct snd_kcontrol_new *kc, unsigned int count) { int i, err = 0; struct snd_kcontrol_new template = { .iface = SNDRV_CTL_ELEM_IFACE_MIXER, .access = SNDRV_CTL_ELEM_ACCESS_READWRITE, .get = snd_nativeinstruments_control_get, .put = snd_nativeinstruments_control_put, .info = snd_ctl_boolean_mono_info, }; for (i = 0; i < count; i++) { struct usb_mixer_elem_list *list; template.name = kc[i].name; template.private_value = kc[i].private_value; err = add_single_ctl_with_resume(mixer, 0, snd_ni_update_cur_val, &template, &list); if (err < 0) break; snd_ni_control_init_val(mixer, list->kctl); } return err; } /* M-Audio FastTrack Ultra quirks */ /* FTU Effect switch (also used by C400/C600) */ static int snd_ftu_eff_switch_info(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_info *uinfo) { static const char *const texts[8] = { "Room 1", "Room 2", "Room 3", "Hall 1", "Hall 2", "Plate", "Delay", "Echo" }; return snd_ctl_enum_info(uinfo, 1, ARRAY_SIZE(texts), texts); } static int snd_ftu_eff_switch_init(struct usb_mixer_interface *mixer, struct snd_kcontrol *kctl) { struct usb_device *dev = mixer->chip->dev; unsigned int pval = kctl->private_value; int err; unsigned char value[2]; value[0] = 0x00; value[1] = 0x00; err = snd_usb_ctl_msg(dev, usb_rcvctrlpipe(dev, 0), UAC_GET_CUR, USB_RECIP_INTERFACE | USB_TYPE_CLASS | USB_DIR_IN, pval & 0xff00, snd_usb_ctrl_intf(mixer->chip) | ((pval & 0xff) << 8), value, 2); if (err < 0) return err; kctl->private_value |= (unsigned int)value[0] << 24; return 0; } static int snd_ftu_eff_switch_get(struct snd_kcontrol *kctl, struct snd_ctl_elem_value *ucontrol) { ucontrol->value.enumerated.item[0] = kctl->private_value >> 24; return 0; } static int snd_ftu_eff_switch_update(struct usb_mixer_elem_list *list) { struct snd_usb_audio *chip = list->mixer->chip; unsigned int pval = list->kctl->private_value; unsigned char value[2]; int err; value[0] = pval >> 24; value[1] = 0; err = snd_usb_lock_shutdown(chip); if (err < 0) return err; err = snd_usb_ctl_msg(chip->dev, usb_sndctrlpipe(chip->dev, 0), UAC_SET_CUR, USB_RECIP_INTERFACE | USB_TYPE_CLASS | USB_DIR_OUT, pval & 0xff00, snd_usb_ctrl_intf(chip) | ((pval & 0xff) << 8), value, 2); snd_usb_unlock_shutdown(chip); return err; } static int snd_ftu_eff_switch_put(struct snd_kcontrol *kctl, struct snd_ctl_elem_value *ucontrol) { struct usb_mixer_elem_list *list = snd_kcontrol_chip(kctl); unsigned int pval = list->kctl->private_value; int cur_val, err, new_val; cur_val = pval >> 24; new_val = ucontrol->value.enumerated.item[0]; if (cur_val == new_val) return 0; kctl->private_value &= ~(0xff << 24); kctl->private_value |= new_val << 24; err = snd_ftu_eff_switch_update(list); return err < 0 ? err : 1; } static int snd_ftu_create_effect_switch(struct usb_mixer_interface *mixer, int validx, int bUnitID) { static struct snd_kcontrol_new template = { .iface = SNDRV_CTL_ELEM_IFACE_MIXER, .name = "Effect Program Switch", .index = 0, .access = SNDRV_CTL_ELEM_ACCESS_READWRITE, .info = snd_ftu_eff_switch_info, .get = snd_ftu_eff_switch_get, .put = snd_ftu_eff_switch_put }; struct usb_mixer_elem_list *list; int err; err = add_single_ctl_with_resume(mixer, bUnitID, snd_ftu_eff_switch_update, &template, &list); if (err < 0) return err; list->kctl->private_value = (validx << 8) | bUnitID; snd_ftu_eff_switch_init(mixer, list->kctl); return 0; } /* Create volume controls for FTU devices*/ static int snd_ftu_create_volume_ctls(struct usb_mixer_interface *mixer) { char name[64]; unsigned int control, cmask; int in, out, err; const unsigned int id = 5; const int val_type = USB_MIXER_S16; for (out = 0; out < 8; out++) { control = out + 1; for (in = 0; in < 8; in++) { cmask = 1 << in; snprintf(name, sizeof(name), "AIn%d - Out%d Capture Volume", in + 1, out + 1); err = snd_create_std_mono_ctl(mixer, id, control, cmask, val_type, name, &snd_usb_mixer_vol_tlv); if (err < 0) return err; } for (in = 8; in < 16; in++) { cmask = 1 << in; snprintf(name, sizeof(name), "DIn%d - Out%d Playback Volume", in - 7, out + 1); err = snd_create_std_mono_ctl(mixer, id, control, cmask, val_type, name, &snd_usb_mixer_vol_tlv); if (err < 0) return err; } } return 0; } /* This control needs a volume quirk, see mixer.c */ static int snd_ftu_create_effect_volume_ctl(struct usb_mixer_interface *mixer) { static const char name[] = "Effect Volume"; const unsigned int id = 6; const int val_type = USB_MIXER_U8; const unsigned int control = 2; const unsigned int cmask = 0; return snd_create_std_mono_ctl(mixer, id, control, cmask, val_type, name, snd_usb_mixer_vol_tlv); } /* This control needs a volume quirk, see mixer.c */ static int snd_ftu_create_effect_duration_ctl(struct usb_mixer_interface *mixer) { static const char name[] = "Effect Duration"; const unsigned int id = 6; const int val_type = USB_MIXER_S16; const unsigned int control = 3; const unsigned int cmask = 0; return snd_create_std_mono_ctl(mixer, id, control, cmask, val_type, name, snd_usb_mixer_vol_tlv); } /* This control needs a volume quirk, see mixer.c */ static int snd_ftu_create_effect_feedback_ctl(struct usb_mixer_interface *mixer) { static const char name[] = "Effect Feedback Volume"; const unsigned int id = 6; const int val_type = USB_MIXER_U8; const unsigned int control = 4; const unsigned int cmask = 0; return snd_create_std_mono_ctl(mixer, id, control, cmask, val_type, name, NULL); } static int snd_ftu_create_effect_return_ctls(struct usb_mixer_interface *mixer) { unsigned int cmask; int err, ch; char name[48]; const unsigned int id = 7; const int val_type = USB_MIXER_S16; const unsigned int control = 7; for (ch = 0; ch < 4; ++ch) { cmask = 1 << ch; snprintf(name, sizeof(name), "Effect Return %d Volume", ch + 1); err = snd_create_std_mono_ctl(mixer, id, control, cmask, val_type, name, snd_usb_mixer_vol_tlv); if (err < 0) return err; } return 0; } static int snd_ftu_create_effect_send_ctls(struct usb_mixer_interface *mixer) { unsigned int cmask; int err, ch; char name[48]; const unsigned int id = 5; const int val_type = USB_MIXER_S16; const unsigned int control = 9; for (ch = 0; ch < 8; ++ch) { cmask = 1 << ch; snprintf(name, sizeof(name), "Effect Send AIn%d Volume", ch + 1); err = snd_create_std_mono_ctl(mixer, id, control, cmask, val_type, name, snd_usb_mixer_vol_tlv); if (err < 0) return err; } for (ch = 8; ch < 16; ++ch) { cmask = 1 << ch; snprintf(name, sizeof(name), "Effect Send DIn%d Volume", ch - 7); err = snd_create_std_mono_ctl(mixer, id, control, cmask, val_type, name, snd_usb_mixer_vol_tlv); if (err < 0) return err; } return 0; } static int snd_ftu_create_mixer(struct usb_mixer_interface *mixer) { int err; err = snd_ftu_create_volume_ctls(mixer); if (err < 0) return err; err = snd_ftu_create_effect_switch(mixer, 1, 6); if (err < 0) return err; err = snd_ftu_create_effect_volume_ctl(mixer); if (err < 0) return err; err = snd_ftu_create_effect_duration_ctl(mixer); if (err < 0) return err; err = snd_ftu_create_effect_feedback_ctl(mixer); if (err < 0) return err; err = snd_ftu_create_effect_return_ctls(mixer); if (err < 0) return err; err = snd_ftu_create_effect_send_ctls(mixer); if (err < 0) return err; return 0; } void snd_emuusb_set_samplerate(struct snd_usb_audio *chip, unsigned char samplerate_id) { struct usb_mixer_interface *mixer; struct usb_mixer_elem_info *cval; int unitid = 12; /* SampleRate ExtensionUnit ID */ list_for_each_entry(mixer, &chip->mixer_list, list) { if (mixer->id_elems[unitid]) { cval = mixer_elem_list_to_info(mixer->id_elems[unitid]); snd_usb_mixer_set_ctl_value(cval, UAC_SET_CUR, cval->control << 8, samplerate_id); snd_usb_mixer_notify_id(mixer, unitid); break; } } } /* M-Audio Fast Track C400/C600 */ /* C400/C600 volume controls, this control needs a volume quirk, see mixer.c */ static int snd_c400_create_vol_ctls(struct usb_mixer_interface *mixer) { char name[64]; unsigned int cmask, offset; int out, chan, err; int num_outs = 0; int num_ins = 0; const unsigned int id = 0x40; const int val_type = USB_MIXER_S16; const int control = 1; switch (mixer->chip->usb_id) { case USB_ID(0x0763, 0x2030): num_outs = 6; num_ins = 4; break; case USB_ID(0x0763, 0x2031): num_outs = 8; num_ins = 6; break; } for (chan = 0; chan < num_outs + num_ins; chan++) { for (out = 0; out < num_outs; out++) { if (chan < num_outs) { snprintf(name, sizeof(name), "PCM%d-Out%d Playback Volume", chan + 1, out + 1); } else { snprintf(name, sizeof(name), "In%d-Out%d Playback Volume", chan - num_outs + 1, out + 1); } cmask = (out == 0) ? 0 : 1 << (out - 1); offset = chan * num_outs; err = snd_create_std_mono_ctl_offset(mixer, id, control, cmask, val_type, offset, name, &snd_usb_mixer_vol_tlv); if (err < 0) return err; } } return 0; } /* This control needs a volume quirk, see mixer.c */ static int snd_c400_create_effect_volume_ctl(struct usb_mixer_interface *mixer) { static const char name[] = "Effect Volume"; const unsigned int id = 0x43; const int val_type = USB_MIXER_U8; const unsigned int control = 3; const unsigned int cmask = 0; return snd_create_std_mono_ctl(mixer, id, control, cmask, val_type, name, snd_usb_mixer_vol_tlv); } /* This control needs a volume quirk, see mixer.c */ static int snd_c400_create_effect_duration_ctl(struct usb_mixer_interface *mixer) { static const char name[] = "Effect Duration"; const unsigned int id = 0x43; const int val_type = USB_MIXER_S16; const unsigned int control = 4; const unsigned int cmask = 0; return snd_create_std_mono_ctl(mixer, id, control, cmask, val_type, name, snd_usb_mixer_vol_tlv); } /* This control needs a volume quirk, see mixer.c */ static int snd_c400_create_effect_feedback_ctl(struct usb_mixer_interface *mixer) { static const char name[] = "Effect Feedback Volume"; const unsigned int id = 0x43; const int val_type = USB_MIXER_U8; const unsigned int control = 5; const unsigned int cmask = 0; return snd_create_std_mono_ctl(mixer, id, control, cmask, val_type, name, NULL); } static int snd_c400_create_effect_vol_ctls(struct usb_mixer_interface *mixer) { char name[64]; unsigned int cmask; int chan, err; int num_outs = 0; int num_ins = 0; const unsigned int id = 0x42; const int val_type = USB_MIXER_S16; const int control = 1; switch (mixer->chip->usb_id) { case USB_ID(0x0763, 0x2030): num_outs = 6; num_ins = 4; break; case USB_ID(0x0763, 0x2031): num_outs = 8; num_ins = 6; break; } for (chan = 0; chan < num_outs + num_ins; chan++) { if (chan < num_outs) { snprintf(name, sizeof(name), "Effect Send DOut%d", chan + 1); } else { snprintf(name, sizeof(name), "Effect Send AIn%d", chan - num_outs + 1); } cmask = (chan == 0) ? 0 : 1 << (chan - 1); err = snd_create_std_mono_ctl(mixer, id, control, cmask, val_type, name, &snd_usb_mixer_vol_tlv); if (err < 0) return err; } return 0; } static int snd_c400_create_effect_ret_vol_ctls(struct usb_mixer_interface *mixer) { char name[64]; unsigned int cmask; int chan, err; int num_outs = 0; int offset = 0; const unsigned int id = 0x40; const int val_type = USB_MIXER_S16; const int control = 1; switch (mixer->chip->usb_id) { case USB_ID(0x0763, 0x2030): num_outs = 6; offset = 0x3c; /* { 0x3c, 0x43, 0x3e, 0x45, 0x40, 0x47 } */ break; case USB_ID(0x0763, 0x2031): num_outs = 8; offset = 0x70; /* { 0x70, 0x79, 0x72, 0x7b, 0x74, 0x7d, 0x76, 0x7f } */ break; } for (chan = 0; chan < num_outs; chan++) { snprintf(name, sizeof(name), "Effect Return %d", chan + 1); cmask = (chan == 0) ? 0 : 1 << (chan + (chan % 2) * num_outs - 1); err = snd_create_std_mono_ctl_offset(mixer, id, control, cmask, val_type, offset, name, &snd_usb_mixer_vol_tlv); if (err < 0) return err; } return 0; } static int snd_c400_create_mixer(struct usb_mixer_interface *mixer) { int err; err = snd_c400_create_vol_ctls(mixer); if (err < 0) return err; err = snd_c400_create_effect_vol_ctls(mixer); if (err < 0) return err; err = snd_c400_create_effect_ret_vol_ctls(mixer); if (err < 0) return err; err = snd_ftu_create_effect_switch(mixer, 2, 0x43); if (err < 0) return err; err = snd_c400_create_effect_volume_ctl(mixer); if (err < 0) return err; err = snd_c400_create_effect_duration_ctl(mixer); if (err < 0) return err; err = snd_c400_create_effect_feedback_ctl(mixer); if (err < 0) return err; return 0; } /* * The mixer units for Ebox-44 are corrupt, and even where they * are valid they presents mono controls as L and R channels of * stereo. So we provide a good mixer here. */ static const struct std_mono_table ebox44_table[] = { { .unitid = 4, .control = 1, .cmask = 0x0, .val_type = USB_MIXER_INV_BOOLEAN, .name = "Headphone Playback Switch" }, { .unitid = 4, .control = 2, .cmask = 0x1, .val_type = USB_MIXER_S16, .name = "Headphone A Mix Playback Volume" }, { .unitid = 4, .control = 2, .cmask = 0x2, .val_type = USB_MIXER_S16, .name = "Headphone B Mix Playback Volume" }, { .unitid = 7, .control = 1, .cmask = 0x0, .val_type = USB_MIXER_INV_BOOLEAN, .name = "Output Playback Switch" }, { .unitid = 7, .control = 2, .cmask = 0x1, .val_type = USB_MIXER_S16, .name = "Output A Playback Volume" }, { .unitid = 7, .control = 2, .cmask = 0x2, .val_type = USB_MIXER_S16, .name = "Output B Playback Volume" }, { .unitid = 10, .control = 1, .cmask = 0x0, .val_type = USB_MIXER_INV_BOOLEAN, .name = "Input Capture Switch" }, { .unitid = 10, .control = 2, .cmask = 0x1, .val_type = USB_MIXER_S16, .name = "Input A Capture Volume" }, { .unitid = 10, .control = 2, .cmask = 0x2, .val_type = USB_MIXER_S16, .name = "Input B Capture Volume" }, {} }; /* Audio Advantage Micro II findings: * * Mapping spdif AES bits to vendor register.bit: * AES0: [0 0 0 0 2.3 2.2 2.1 2.0] - default 0x00 * AES1: [3.3 3.2.3.1.3.0 2.7 2.6 2.5 2.4] - default: 0x01 * AES2: [0 0 0 0 0 0 0 0] * AES3: [0 0 0 0 0 0 x 0] - 'x' bit is set basing on standard usb request * (UAC_EP_CS_ATTR_SAMPLE_RATE) for Audio Devices * * power on values: * r2: 0x10 * r3: 0x20 (b7 is zeroed just before playback (except IEC61937) and set * just after it to 0xa0, presumably it disables/mutes some analog * parts when there is no audio.) * r9: 0x28 * * Optical transmitter on/off: * vendor register.bit: 9.1 * 0 - on (0x28 register value) * 1 - off (0x2a register value) * */ static int snd_microii_spdif_info(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_info *uinfo) { uinfo->type = SNDRV_CTL_ELEM_TYPE_IEC958; uinfo->count = 1; return 0; } static int snd_microii_spdif_default_get(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { struct usb_mixer_elem_list *list = snd_kcontrol_chip(kcontrol); struct snd_usb_audio *chip = list->mixer->chip; int err; struct usb_interface *iface; struct usb_host_interface *alts; unsigned int ep; unsigned char data[3]; int rate; err = snd_usb_lock_shutdown(chip); if (err < 0) return err; ucontrol->value.iec958.status[0] = kcontrol->private_value & 0xff; ucontrol->value.iec958.status[1] = (kcontrol->private_value >> 8) & 0xff; ucontrol->value.iec958.status[2] = 0x00; /* use known values for that card: interface#1 altsetting#1 */ iface = usb_ifnum_to_if(chip->dev, 1); if (!iface || iface->num_altsetting < 2) { err = -EINVAL; goto end; } alts = &iface->altsetting[1]; if (get_iface_desc(alts)->bNumEndpoints < 1) { err = -EINVAL; goto end; } ep = get_endpoint(alts, 0)->bEndpointAddress; err = snd_usb_ctl_msg(chip->dev, usb_rcvctrlpipe(chip->dev, 0), UAC_GET_CUR, USB_TYPE_CLASS | USB_RECIP_ENDPOINT | USB_DIR_IN, UAC_EP_CS_ATTR_SAMPLE_RATE << 8, ep, data, sizeof(data)); if (err < 0) goto end; rate = data[0] | (data[1] << 8) | (data[2] << 16); ucontrol->value.iec958.status[3] = (rate == 48000) ? IEC958_AES3_CON_FS_48000 : IEC958_AES3_CON_FS_44100; err = 0; end: snd_usb_unlock_shutdown(chip); return err; } static int snd_microii_spdif_default_update(struct usb_mixer_elem_list *list) { struct snd_usb_audio *chip = list->mixer->chip; unsigned int pval = list->kctl->private_value; u8 reg; int err; err = snd_usb_lock_shutdown(chip); if (err < 0) return err; reg = ((pval >> 4) & 0xf0) | (pval & 0x0f); err = snd_usb_ctl_msg(chip->dev, usb_sndctrlpipe(chip->dev, 0), UAC_SET_CUR, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_OTHER, reg, 2, NULL, 0); if (err < 0) goto end; reg = (pval & IEC958_AES0_NONAUDIO) ? 0xa0 : 0x20; reg |= (pval >> 12) & 0x0f; err = snd_usb_ctl_msg(chip->dev, usb_sndctrlpipe(chip->dev, 0), UAC_SET_CUR, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_OTHER, reg, 3, NULL, 0); if (err < 0) goto end; end: snd_usb_unlock_shutdown(chip); return err; } static int snd_microii_spdif_default_put(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { struct usb_mixer_elem_list *list = snd_kcontrol_chip(kcontrol); unsigned int pval, pval_old; int err; pval = pval_old = kcontrol->private_value; pval &= 0xfffff0f0; pval |= (ucontrol->value.iec958.status[1] & 0x0f) << 8; pval |= (ucontrol->value.iec958.status[0] & 0x0f); pval &= 0xffff0fff; pval |= (ucontrol->value.iec958.status[1] & 0xf0) << 8; /* The frequency bits in AES3 cannot be set via register access. */ /* Silently ignore any bits from the request that cannot be set. */ if (pval == pval_old) return 0; kcontrol->private_value = pval; err = snd_microii_spdif_default_update(list); return err < 0 ? err : 1; } static int snd_microii_spdif_mask_get(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { ucontrol->value.iec958.status[0] = 0x0f; ucontrol->value.iec958.status[1] = 0xff; ucontrol->value.iec958.status[2] = 0x00; ucontrol->value.iec958.status[3] = 0x00; return 0; } static int snd_microii_spdif_switch_get(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { ucontrol->value.integer.value[0] = !(kcontrol->private_value & 0x02); return 0; } static int snd_microii_spdif_switch_update(struct usb_mixer_elem_list *list) { struct snd_usb_audio *chip = list->mixer->chip; u8 reg = list->kctl->private_value; int err; err = snd_usb_lock_shutdown(chip); if (err < 0) return err; err = snd_usb_ctl_msg(chip->dev, usb_sndctrlpipe(chip->dev, 0), UAC_SET_CUR, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_OTHER, reg, 9, NULL, 0); snd_usb_unlock_shutdown(chip); return err; } static int snd_microii_spdif_switch_put(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { struct usb_mixer_elem_list *list = snd_kcontrol_chip(kcontrol); u8 reg; int err; reg = ucontrol->value.integer.value[0] ? 0x28 : 0x2a; if (reg != list->kctl->private_value) return 0; kcontrol->private_value = reg; err = snd_microii_spdif_switch_update(list); return err < 0 ? err : 1; } static const struct snd_kcontrol_new snd_microii_mixer_spdif[] = { { .iface = SNDRV_CTL_ELEM_IFACE_PCM, .name = SNDRV_CTL_NAME_IEC958("", PLAYBACK, DEFAULT), .info = snd_microii_spdif_info, .get = snd_microii_spdif_default_get, .put = snd_microii_spdif_default_put, .private_value = 0x00000100UL,/* reset value */ }, { .access = SNDRV_CTL_ELEM_ACCESS_READ, .iface = SNDRV_CTL_ELEM_IFACE_PCM, .name = SNDRV_CTL_NAME_IEC958("", PLAYBACK, MASK), .info = snd_microii_spdif_info, .get = snd_microii_spdif_mask_get, }, { .iface = SNDRV_CTL_ELEM_IFACE_MIXER, .name = SNDRV_CTL_NAME_IEC958("", PLAYBACK, SWITCH), .info = snd_ctl_boolean_mono_info, .get = snd_microii_spdif_switch_get, .put = snd_microii_spdif_switch_put, .private_value = 0x00000028UL,/* reset value */ } }; static int snd_microii_controls_create(struct usb_mixer_interface *mixer) { int err, i; static const usb_mixer_elem_resume_func_t resume_funcs[] = { snd_microii_spdif_default_update, NULL, snd_microii_spdif_switch_update }; for (i = 0; i < ARRAY_SIZE(snd_microii_mixer_spdif); ++i) { err = add_single_ctl_with_resume(mixer, 0, resume_funcs[i], &snd_microii_mixer_spdif[i], NULL); if (err < 0) return err; } return 0; } /* Creative Sound Blaster E1 */ static int snd_soundblaster_e1_switch_get(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { ucontrol->value.integer.value[0] = kcontrol->private_value; return 0; } static int snd_soundblaster_e1_switch_update(struct usb_mixer_interface *mixer, unsigned char state) { struct snd_usb_audio *chip = mixer->chip; int err; unsigned char buff[2]; buff[0] = 0x02; buff[1] = state ? 0x02 : 0x00; err = snd_usb_lock_shutdown(chip); if (err < 0) return err; err = snd_usb_ctl_msg(chip->dev, usb_sndctrlpipe(chip->dev, 0), HID_REQ_SET_REPORT, USB_TYPE_CLASS | USB_RECIP_INTERFACE | USB_DIR_OUT, 0x0202, 3, buff, 2); snd_usb_unlock_shutdown(chip); return err; } static int snd_soundblaster_e1_switch_put(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { struct usb_mixer_elem_list *list = snd_kcontrol_chip(kcontrol); unsigned char value = !!ucontrol->value.integer.value[0]; int err; if (kcontrol->private_value == value) return 0; kcontrol->private_value = value; err = snd_soundblaster_e1_switch_update(list->mixer, value); return err < 0 ? err : 1; } static int snd_soundblaster_e1_switch_resume(struct usb_mixer_elem_list *list) { return snd_soundblaster_e1_switch_update(list->mixer, list->kctl->private_value); } static int snd_soundblaster_e1_switch_info(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_info *uinfo) { static const char *const texts[2] = { "Mic", "Aux" }; return snd_ctl_enum_info(uinfo, 1, ARRAY_SIZE(texts), texts); } static const struct snd_kcontrol_new snd_soundblaster_e1_input_switch = { .iface = SNDRV_CTL_ELEM_IFACE_MIXER, .name = "Input Source", .info = snd_soundblaster_e1_switch_info, .get = snd_soundblaster_e1_switch_get, .put = snd_soundblaster_e1_switch_put, .private_value = 0, }; static int snd_soundblaster_e1_switch_create(struct usb_mixer_interface *mixer) { return add_single_ctl_with_resume(mixer, 0, snd_soundblaster_e1_switch_resume, &snd_soundblaster_e1_input_switch, NULL); } /* * Dell WD15 dock jack detection * * The WD15 contains an ALC4020 USB audio controller and ALC3263 audio codec * from Realtek. It is a UAC 1 device, and UAC 1 does not support jack * detection. Instead, jack detection works by sending HD Audio commands over * vendor-type USB messages. */ #define HDA_VERB_CMD(V, N, D) (((N) << 20) | ((V) << 8) | (D)) #define REALTEK_HDA_VALUE 0x0038 #define REALTEK_HDA_SET 62 #define REALTEK_MANUAL_MODE 72 #define REALTEK_HDA_GET_OUT 88 #define REALTEK_HDA_GET_IN 89 #define REALTEK_AUDIO_FUNCTION_GROUP 0x01 #define REALTEK_LINE1 0x1a #define REALTEK_VENDOR_REGISTERS 0x20 #define REALTEK_HP_OUT 0x21 #define REALTEK_CBJ_CTRL2 0x50 #define REALTEK_JACK_INTERRUPT_NODE 5 #define REALTEK_MIC_FLAG 0x100 static int realtek_hda_set(struct snd_usb_audio *chip, u32 cmd) { struct usb_device *dev = chip->dev; __be32 buf = cpu_to_be32(cmd); return snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), REALTEK_HDA_SET, USB_RECIP_DEVICE | USB_TYPE_VENDOR | USB_DIR_OUT, REALTEK_HDA_VALUE, 0, &buf, sizeof(buf)); } static int realtek_hda_get(struct snd_usb_audio *chip, u32 cmd, u32 *value) { struct usb_device *dev = chip->dev; int err; __be32 buf = cpu_to_be32(cmd); err = snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), REALTEK_HDA_GET_OUT, USB_RECIP_DEVICE | USB_TYPE_VENDOR | USB_DIR_OUT, REALTEK_HDA_VALUE, 0, &buf, sizeof(buf)); if (err < 0) return err; err = snd_usb_ctl_msg(dev, usb_rcvctrlpipe(dev, 0), REALTEK_HDA_GET_IN, USB_RECIP_DEVICE | USB_TYPE_VENDOR | USB_DIR_IN, REALTEK_HDA_VALUE, 0, &buf, sizeof(buf)); if (err < 0) return err; *value = be32_to_cpu(buf); return 0; } static int realtek_ctl_connector_get(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { struct usb_mixer_elem_info *cval = kcontrol->private_data; struct snd_usb_audio *chip = cval->head.mixer->chip; u32 pv = kcontrol->private_value; u32 node_id = pv & 0xff; u32 sense; u32 cbj_ctrl2; bool presence; int err; err = snd_usb_lock_shutdown(chip); if (err < 0) return err; err = realtek_hda_get(chip, HDA_VERB_CMD(AC_VERB_GET_PIN_SENSE, node_id, 0), &sense); if (err < 0) goto err; if (pv & REALTEK_MIC_FLAG) { err = realtek_hda_set(chip, HDA_VERB_CMD(AC_VERB_SET_COEF_INDEX, REALTEK_VENDOR_REGISTERS, REALTEK_CBJ_CTRL2)); if (err < 0) goto err; err = realtek_hda_get(chip, HDA_VERB_CMD(AC_VERB_GET_PROC_COEF, REALTEK_VENDOR_REGISTERS, 0), &cbj_ctrl2); if (err < 0) goto err; } err: snd_usb_unlock_shutdown(chip); if (err < 0) return err; presence = sense & AC_PINSENSE_PRESENCE; if (pv & REALTEK_MIC_FLAG) presence = presence && (cbj_ctrl2 & 0x0070) == 0x0070; ucontrol->value.integer.value[0] = presence; return 0; } static const struct snd_kcontrol_new realtek_connector_ctl_ro = { .iface = SNDRV_CTL_ELEM_IFACE_CARD, .name = "", /* will be filled later manually */ .access = SNDRV_CTL_ELEM_ACCESS_READ, .info = snd_ctl_boolean_mono_info, .get = realtek_ctl_connector_get, }; static int realtek_resume_jack(struct usb_mixer_elem_list *list) { snd_ctl_notify(list->mixer->chip->card, SNDRV_CTL_EVENT_MASK_VALUE, &list->kctl->id); return 0; } static int realtek_add_jack(struct usb_mixer_interface *mixer, char *name, u32 val) { struct usb_mixer_elem_info *cval; struct snd_kcontrol *kctl; cval = kzalloc(sizeof(*cval), GFP_KERNEL); if (!cval) return -ENOMEM; snd_usb_mixer_elem_init_std(&cval->head, mixer, REALTEK_JACK_INTERRUPT_NODE); cval->head.resume = realtek_resume_jack; cval->val_type = USB_MIXER_BOOLEAN; cval->channels = 1; cval->min = 0; cval->max = 1; kctl = snd_ctl_new1(&realtek_connector_ctl_ro, cval); if (!kctl) { kfree(cval); return -ENOMEM; } kctl->private_value = val; strscpy(kctl->id.name, name, sizeof(kctl->id.name)); kctl->private_free = snd_usb_mixer_elem_free; return snd_usb_mixer_add_control(&cval->head, kctl); } static int dell_dock_mixer_create(struct usb_mixer_interface *mixer) { int err; struct usb_device *dev = mixer->chip->dev; /* Power down the audio codec to avoid loud pops in the next step. */ realtek_hda_set(mixer->chip, HDA_VERB_CMD(AC_VERB_SET_POWER_STATE, REALTEK_AUDIO_FUNCTION_GROUP, AC_PWRST_D3)); /* * Turn off 'manual mode' in case it was enabled. This removes the need * to power cycle the dock after it was attached to a Windows machine. */ snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), REALTEK_MANUAL_MODE, USB_RECIP_DEVICE | USB_TYPE_VENDOR | USB_DIR_OUT, 0, 0, NULL, 0); err = realtek_add_jack(mixer, "Line Out Jack", REALTEK_LINE1); if (err < 0) return err; err = realtek_add_jack(mixer, "Headphone Jack", REALTEK_HP_OUT); if (err < 0) return err; err = realtek_add_jack(mixer, "Headset Mic Jack", REALTEK_HP_OUT | REALTEK_MIC_FLAG); if (err < 0) return err; return 0; } static void dell_dock_init_vol(struct snd_usb_audio *chip, int ch, int id) { u16 buf = 0; snd_usb_ctl_msg(chip->dev, usb_sndctrlpipe(chip->dev, 0), UAC_SET_CUR, USB_RECIP_INTERFACE | USB_TYPE_CLASS | USB_DIR_OUT, (UAC_FU_VOLUME << 8) | ch, snd_usb_ctrl_intf(chip) | (id << 8), &buf, 2); } static int dell_dock_mixer_init(struct usb_mixer_interface *mixer) { /* fix to 0dB playback volumes */ dell_dock_init_vol(mixer->chip, 1, 16); dell_dock_init_vol(mixer->chip, 2, 16); dell_dock_init_vol(mixer->chip, 1, 19); dell_dock_init_vol(mixer->chip, 2, 19); return 0; } /* RME Class Compliant device quirks */ #define SND_RME_GET_STATUS1 23 #define SND_RME_GET_CURRENT_FREQ 17 #define SND_RME_CLK_SYSTEM_SHIFT 16 #define SND_RME_CLK_SYSTEM_MASK 0x1f #define SND_RME_CLK_AES_SHIFT 8 #define SND_RME_CLK_SPDIF_SHIFT 12 #define SND_RME_CLK_AES_SPDIF_MASK 0xf #define SND_RME_CLK_SYNC_SHIFT 6 #define SND_RME_CLK_SYNC_MASK 0x3 #define SND_RME_CLK_FREQMUL_SHIFT 18 #define SND_RME_CLK_FREQMUL_MASK 0x7 #define SND_RME_CLK_SYSTEM(x) \ ((x >> SND_RME_CLK_SYSTEM_SHIFT) & SND_RME_CLK_SYSTEM_MASK) #define SND_RME_CLK_AES(x) \ ((x >> SND_RME_CLK_AES_SHIFT) & SND_RME_CLK_AES_SPDIF_MASK) #define SND_RME_CLK_SPDIF(x) \ ((x >> SND_RME_CLK_SPDIF_SHIFT) & SND_RME_CLK_AES_SPDIF_MASK) #define SND_RME_CLK_SYNC(x) \ ((x >> SND_RME_CLK_SYNC_SHIFT) & SND_RME_CLK_SYNC_MASK) #define SND_RME_CLK_FREQMUL(x) \ ((x >> SND_RME_CLK_FREQMUL_SHIFT) & SND_RME_CLK_FREQMUL_MASK) #define SND_RME_CLK_AES_LOCK 0x1 #define SND_RME_CLK_AES_SYNC 0x4 #define SND_RME_CLK_SPDIF_LOCK 0x2 #define SND_RME_CLK_SPDIF_SYNC 0x8 #define SND_RME_SPDIF_IF_SHIFT 4 #define SND_RME_SPDIF_FORMAT_SHIFT 5 #define SND_RME_BINARY_MASK 0x1 #define SND_RME_SPDIF_IF(x) \ ((x >> SND_RME_SPDIF_IF_SHIFT) & SND_RME_BINARY_MASK) #define SND_RME_SPDIF_FORMAT(x) \ ((x >> SND_RME_SPDIF_FORMAT_SHIFT) & SND_RME_BINARY_MASK) static const u32 snd_rme_rate_table[] = { 32000, 44100, 48000, 50000, 64000, 88200, 96000, 100000, 128000, 176400, 192000, 200000, 256000, 352800, 384000, 400000, 512000, 705600, 768000, 800000 }; /* maximum number of items for AES and S/PDIF rates for above table */ #define SND_RME_RATE_IDX_AES_SPDIF_NUM 12 enum snd_rme_domain { SND_RME_DOMAIN_SYSTEM, SND_RME_DOMAIN_AES, SND_RME_DOMAIN_SPDIF }; enum snd_rme_clock_status { SND_RME_CLOCK_NOLOCK, SND_RME_CLOCK_LOCK, SND_RME_CLOCK_SYNC }; static int snd_rme_read_value(struct snd_usb_audio *chip, unsigned int item, u32 *value) { struct usb_device *dev = chip->dev; int err; err = snd_usb_ctl_msg(dev, usb_rcvctrlpipe(dev, 0), item, USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE, 0, 0, value, sizeof(*value)); if (err < 0) dev_err(&dev->dev, "unable to issue vendor read request %d (ret = %d)", item, err); return err; } static int snd_rme_get_status1(struct snd_kcontrol *kcontrol, u32 *status1) { struct usb_mixer_elem_list *list = snd_kcontrol_chip(kcontrol); struct snd_usb_audio *chip = list->mixer->chip; int err; err = snd_usb_lock_shutdown(chip); if (err < 0) return err; err = snd_rme_read_value(chip, SND_RME_GET_STATUS1, status1); snd_usb_unlock_shutdown(chip); return err; } static int snd_rme_rate_get(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { u32 status1; u32 rate = 0; int idx; int err; err = snd_rme_get_status1(kcontrol, &status1); if (err < 0) return err; switch (kcontrol->private_value) { case SND_RME_DOMAIN_SYSTEM: idx = SND_RME_CLK_SYSTEM(status1); if (idx < ARRAY_SIZE(snd_rme_rate_table)) rate = snd_rme_rate_table[idx]; break; case SND_RME_DOMAIN_AES: idx = SND_RME_CLK_AES(status1); if (idx < SND_RME_RATE_IDX_AES_SPDIF_NUM) rate = snd_rme_rate_table[idx]; break; case SND_RME_DOMAIN_SPDIF: idx = SND_RME_CLK_SPDIF(status1); if (idx < SND_RME_RATE_IDX_AES_SPDIF_NUM) rate = snd_rme_rate_table[idx]; break; default: return -EINVAL; } ucontrol->value.integer.value[0] = rate; return 0; } static int snd_rme_sync_state_get(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { u32 status1; int idx = SND_RME_CLOCK_NOLOCK; int err; err = snd_rme_get_status1(kcontrol, &status1); if (err < 0) return err; switch (kcontrol->private_value) { case SND_RME_DOMAIN_AES: /* AES */ if (status1 & SND_RME_CLK_AES_SYNC) idx = SND_RME_CLOCK_SYNC; else if (status1 & SND_RME_CLK_AES_LOCK) idx = SND_RME_CLOCK_LOCK; break; case SND_RME_DOMAIN_SPDIF: /* SPDIF */ if (status1 & SND_RME_CLK_SPDIF_SYNC) idx = SND_RME_CLOCK_SYNC; else if (status1 & SND_RME_CLK_SPDIF_LOCK) idx = SND_RME_CLOCK_LOCK; break; default: return -EINVAL; } ucontrol->value.enumerated.item[0] = idx; return 0; } static int snd_rme_spdif_if_get(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { u32 status1; int err; err = snd_rme_get_status1(kcontrol, &status1); if (err < 0) return err; ucontrol->value.enumerated.item[0] = SND_RME_SPDIF_IF(status1); return 0; } static int snd_rme_spdif_format_get(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { u32 status1; int err; err = snd_rme_get_status1(kcontrol, &status1); if (err < 0) return err; ucontrol->value.enumerated.item[0] = SND_RME_SPDIF_FORMAT(status1); return 0; } static int snd_rme_sync_source_get(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { u32 status1; int err; err = snd_rme_get_status1(kcontrol, &status1); if (err < 0) return err; ucontrol->value.enumerated.item[0] = SND_RME_CLK_SYNC(status1); return 0; } static int snd_rme_current_freq_get(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { struct usb_mixer_elem_list *list = snd_kcontrol_chip(kcontrol); struct snd_usb_audio *chip = list->mixer->chip; u32 status1; const u64 num = 104857600000000ULL; u32 den; unsigned int freq; int err; err = snd_usb_lock_shutdown(chip); if (err < 0) return err; err = snd_rme_read_value(chip, SND_RME_GET_STATUS1, &status1); if (err < 0) goto end; err = snd_rme_read_value(chip, SND_RME_GET_CURRENT_FREQ, &den); if (err < 0) goto end; freq = (den == 0) ? 0 : div64_u64(num, den); freq <<= SND_RME_CLK_FREQMUL(status1); ucontrol->value.integer.value[0] = freq; end: snd_usb_unlock_shutdown(chip); return err; } static int snd_rme_rate_info(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_info *uinfo) { uinfo->type = SNDRV_CTL_ELEM_TYPE_INTEGER; uinfo->count = 1; switch (kcontrol->private_value) { case SND_RME_DOMAIN_SYSTEM: uinfo->value.integer.min = 32000; uinfo->value.integer.max = 800000; break; case SND_RME_DOMAIN_AES: case SND_RME_DOMAIN_SPDIF: default: uinfo->value.integer.min = 0; uinfo->value.integer.max = 200000; } uinfo->value.integer.step = 0; return 0; } static int snd_rme_sync_state_info(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_info *uinfo) { static const char *const sync_states[] = { "No Lock", "Lock", "Sync" }; return snd_ctl_enum_info(uinfo, 1, ARRAY_SIZE(sync_states), sync_states); } static int snd_rme_spdif_if_info(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_info *uinfo) { static const char *const spdif_if[] = { "Coaxial", "Optical" }; return snd_ctl_enum_info(uinfo, 1, ARRAY_SIZE(spdif_if), spdif_if); } static int snd_rme_spdif_format_info(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_info *uinfo) { static const char *const optical_type[] = { "Consumer", "Professional" }; return snd_ctl_enum_info(uinfo, 1, ARRAY_SIZE(optical_type), optical_type); } static int snd_rme_sync_source_info(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_info *uinfo) { static const char *const sync_sources[] = { "Internal", "AES", "SPDIF", "Internal" }; return snd_ctl_enum_info(uinfo, 1, ARRAY_SIZE(sync_sources), sync_sources); } static const struct snd_kcontrol_new snd_rme_controls[] = { { .iface = SNDRV_CTL_ELEM_IFACE_MIXER, .name = "AES Rate", .access = SNDRV_CTL_ELEM_ACCESS_READ | SNDRV_CTL_ELEM_ACCESS_VOLATILE, .info = snd_rme_rate_info, .get = snd_rme_rate_get, .private_value = SND_RME_DOMAIN_AES }, { .iface = SNDRV_CTL_ELEM_IFACE_MIXER, .name = "AES Sync", .access = SNDRV_CTL_ELEM_ACCESS_READ | SNDRV_CTL_ELEM_ACCESS_VOLATILE, .info = snd_rme_sync_state_info, .get = snd_rme_sync_state_get, .private_value = SND_RME_DOMAIN_AES }, { .iface = SNDRV_CTL_ELEM_IFACE_MIXER, .name = "SPDIF Rate", .access = SNDRV_CTL_ELEM_ACCESS_READ | SNDRV_CTL_ELEM_ACCESS_VOLATILE, .info = snd_rme_rate_info, .get = snd_rme_rate_get, .private_value = SND_RME_DOMAIN_SPDIF }, { .iface = SNDRV_CTL_ELEM_IFACE_MIXER, .name = "SPDIF Sync", .access = SNDRV_CTL_ELEM_ACCESS_READ | SNDRV_CTL_ELEM_ACCESS_VOLATILE, .info = snd_rme_sync_state_info, .get = snd_rme_sync_state_get, .private_value = SND_RME_DOMAIN_SPDIF }, { .iface = SNDRV_CTL_ELEM_IFACE_MIXER, .name = "SPDIF Interface", .access = SNDRV_CTL_ELEM_ACCESS_READ | SNDRV_CTL_ELEM_ACCESS_VOLATILE, .info = snd_rme_spdif_if_info, .get = snd_rme_spdif_if_get, }, { .iface = SNDRV_CTL_ELEM_IFACE_MIXER, .name = "SPDIF Format", .access = SNDRV_CTL_ELEM_ACCESS_READ | SNDRV_CTL_ELEM_ACCESS_VOLATILE, .info = snd_rme_spdif_format_info, .get = snd_rme_spdif_format_get, }, { .iface = SNDRV_CTL_ELEM_IFACE_MIXER, .name = "Sync Source", .access = SNDRV_CTL_ELEM_ACCESS_READ | SNDRV_CTL_ELEM_ACCESS_VOLATILE, .info = snd_rme_sync_source_info, .get = snd_rme_sync_source_get }, { .iface = SNDRV_CTL_ELEM_IFACE_MIXER, .name = "System Rate", .access = SNDRV_CTL_ELEM_ACCESS_READ | SNDRV_CTL_ELEM_ACCESS_VOLATILE, .info = snd_rme_rate_info, .get = snd_rme_rate_get, .private_value = SND_RME_DOMAIN_SYSTEM }, { .iface = SNDRV_CTL_ELEM_IFACE_MIXER, .name = "Current Frequency", .access = SNDRV_CTL_ELEM_ACCESS_READ | SNDRV_CTL_ELEM_ACCESS_VOLATILE, .info = snd_rme_rate_info, .get = snd_rme_current_freq_get } }; static int snd_rme_controls_create(struct usb_mixer_interface *mixer) { int err, i; for (i = 0; i < ARRAY_SIZE(snd_rme_controls); ++i) { err = add_single_ctl_with_resume(mixer, 0, NULL, &snd_rme_controls[i], NULL); if (err < 0) return err; } return 0; } /* * RME Babyface Pro (FS) * * These devices exposes a couple of DSP functions via request to EP0. * Switches are available via control registers, while routing is controlled * by controlling the volume on each possible crossing point. * Volume control is linear, from -inf (dec. 0) to +6dB (dec. 65536) with * 0dB being at dec. 32768. */ enum { SND_BBFPRO_CTL_REG1 = 0, SND_BBFPRO_CTL_REG2 }; #define SND_BBFPRO_CTL_REG_MASK 1 #define SND_BBFPRO_CTL_IDX_MASK 0xff #define SND_BBFPRO_CTL_IDX_SHIFT 1 #define SND_BBFPRO_CTL_VAL_MASK 1 #define SND_BBFPRO_CTL_VAL_SHIFT 9 #define SND_BBFPRO_CTL_REG1_CLK_MASTER 0 #define SND_BBFPRO_CTL_REG1_CLK_OPTICAL 1 #define SND_BBFPRO_CTL_REG1_SPDIF_PRO 7 #define SND_BBFPRO_CTL_REG1_SPDIF_EMPH 8 #define SND_BBFPRO_CTL_REG1_SPDIF_OPTICAL 10 #define SND_BBFPRO_CTL_REG2_48V_AN1 0 #define SND_BBFPRO_CTL_REG2_48V_AN2 1 #define SND_BBFPRO_CTL_REG2_SENS_IN3 2 #define SND_BBFPRO_CTL_REG2_SENS_IN4 3 #define SND_BBFPRO_CTL_REG2_PAD_AN1 4 #define SND_BBFPRO_CTL_REG2_PAD_AN2 5 #define SND_BBFPRO_MIXER_IDX_MASK 0x1ff #define SND_BBFPRO_MIXER_VAL_MASK 0x3ffff #define SND_BBFPRO_MIXER_VAL_SHIFT 9 #define SND_BBFPRO_MIXER_VAL_MIN 0 // -inf #define SND_BBFPRO_MIXER_VAL_MAX 65536 // +6dB #define SND_BBFPRO_USBREQ_CTL_REG1 0x10 #define SND_BBFPRO_USBREQ_CTL_REG2 0x17 #define SND_BBFPRO_USBREQ_MIXER 0x12 static int snd_bbfpro_ctl_update(struct usb_mixer_interface *mixer, u8 reg, u8 index, u8 value) { int err; u16 usb_req, usb_idx, usb_val; struct snd_usb_audio *chip = mixer->chip; err = snd_usb_lock_shutdown(chip); if (err < 0) return err; if (reg == SND_BBFPRO_CTL_REG1) { usb_req = SND_BBFPRO_USBREQ_CTL_REG1; if (index == SND_BBFPRO_CTL_REG1_CLK_OPTICAL) { usb_idx = 3; usb_val = value ? 3 : 0; } else { usb_idx = 1 << index; usb_val = value ? usb_idx : 0; } } else { usb_req = SND_BBFPRO_USBREQ_CTL_REG2; usb_idx = 1 << index; usb_val = value ? usb_idx : 0; } err = snd_usb_ctl_msg(chip->dev, usb_sndctrlpipe(chip->dev, 0), usb_req, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE, usb_val, usb_idx, NULL, 0); snd_usb_unlock_shutdown(chip); return err; } static int snd_bbfpro_ctl_get(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { u8 reg, idx, val; int pv; pv = kcontrol->private_value; reg = pv & SND_BBFPRO_CTL_REG_MASK; idx = (pv >> SND_BBFPRO_CTL_IDX_SHIFT) & SND_BBFPRO_CTL_IDX_MASK; val = kcontrol->private_value >> SND_BBFPRO_CTL_VAL_SHIFT; if ((reg == SND_BBFPRO_CTL_REG1 && idx == SND_BBFPRO_CTL_REG1_CLK_OPTICAL) || (reg == SND_BBFPRO_CTL_REG2 && (idx == SND_BBFPRO_CTL_REG2_SENS_IN3 || idx == SND_BBFPRO_CTL_REG2_SENS_IN4))) { ucontrol->value.enumerated.item[0] = val; } else { ucontrol->value.integer.value[0] = val; } return 0; } static int snd_bbfpro_ctl_info(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_info *uinfo) { u8 reg, idx; int pv; pv = kcontrol->private_value; reg = pv & SND_BBFPRO_CTL_REG_MASK; idx = (pv >> SND_BBFPRO_CTL_IDX_SHIFT) & SND_BBFPRO_CTL_IDX_MASK; if (reg == SND_BBFPRO_CTL_REG1 && idx == SND_BBFPRO_CTL_REG1_CLK_OPTICAL) { static const char * const texts[2] = { "AutoSync", "Internal" }; return snd_ctl_enum_info(uinfo, 1, 2, texts); } else if (reg == SND_BBFPRO_CTL_REG2 && (idx == SND_BBFPRO_CTL_REG2_SENS_IN3 || idx == SND_BBFPRO_CTL_REG2_SENS_IN4)) { static const char * const texts[2] = { "-10dBV", "+4dBu" }; return snd_ctl_enum_info(uinfo, 1, 2, texts); } uinfo->count = 1; uinfo->value.integer.min = 0; uinfo->value.integer.max = 1; uinfo->type = SNDRV_CTL_ELEM_TYPE_BOOLEAN; return 0; } static int snd_bbfpro_ctl_put(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { int err; u8 reg, idx; int old_value, pv, val; struct usb_mixer_elem_list *list = snd_kcontrol_chip(kcontrol); struct usb_mixer_interface *mixer = list->mixer; pv = kcontrol->private_value; reg = pv & SND_BBFPRO_CTL_REG_MASK; idx = (pv >> SND_BBFPRO_CTL_IDX_SHIFT) & SND_BBFPRO_CTL_IDX_MASK; old_value = (pv >> SND_BBFPRO_CTL_VAL_SHIFT) & SND_BBFPRO_CTL_VAL_MASK; if ((reg == SND_BBFPRO_CTL_REG1 && idx == SND_BBFPRO_CTL_REG1_CLK_OPTICAL) || (reg == SND_BBFPRO_CTL_REG2 && (idx == SND_BBFPRO_CTL_REG2_SENS_IN3 || idx == SND_BBFPRO_CTL_REG2_SENS_IN4))) { val = ucontrol->value.enumerated.item[0]; } else { val = ucontrol->value.integer.value[0]; } if (val > 1) return -EINVAL; if (val == old_value) return 0; kcontrol->private_value = reg | ((idx & SND_BBFPRO_CTL_IDX_MASK) << SND_BBFPRO_CTL_IDX_SHIFT) | ((val & SND_BBFPRO_CTL_VAL_MASK) << SND_BBFPRO_CTL_VAL_SHIFT); err = snd_bbfpro_ctl_update(mixer, reg, idx, val); return err < 0 ? err : 1; } static int snd_bbfpro_ctl_resume(struct usb_mixer_elem_list *list) { u8 reg, idx; int value, pv; pv = list->kctl->private_value; reg = pv & SND_BBFPRO_CTL_REG_MASK; idx = (pv >> SND_BBFPRO_CTL_IDX_SHIFT) & SND_BBFPRO_CTL_IDX_MASK; value = (pv >> SND_BBFPRO_CTL_VAL_SHIFT) & SND_BBFPRO_CTL_VAL_MASK; return snd_bbfpro_ctl_update(list->mixer, reg, idx, value); } static int snd_bbfpro_vol_update(struct usb_mixer_interface *mixer, u16 index, u32 value) { struct snd_usb_audio *chip = mixer->chip; int err; u16 idx; u16 usb_idx, usb_val; u32 v; err = snd_usb_lock_shutdown(chip); if (err < 0) return err; idx = index & SND_BBFPRO_MIXER_IDX_MASK; // 18 bit linear volume, split so 2 bits end up in index. v = value & SND_BBFPRO_MIXER_VAL_MASK; usb_idx = idx | (v & 0x3) << 14; usb_val = (v >> 2) & 0xffff; err = snd_usb_ctl_msg(chip->dev, usb_sndctrlpipe(chip->dev, 0), SND_BBFPRO_USBREQ_MIXER, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE, usb_val, usb_idx, NULL, 0); snd_usb_unlock_shutdown(chip); return err; } static int snd_bbfpro_vol_get(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { ucontrol->value.integer.value[0] = kcontrol->private_value >> SND_BBFPRO_MIXER_VAL_SHIFT; return 0; } static int snd_bbfpro_vol_info(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_info *uinfo) { uinfo->type = SNDRV_CTL_ELEM_TYPE_INTEGER; uinfo->count = 1; uinfo->value.integer.min = SND_BBFPRO_MIXER_VAL_MIN; uinfo->value.integer.max = SND_BBFPRO_MIXER_VAL_MAX; return 0; } static int snd_bbfpro_vol_put(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { int err; u16 idx; u32 new_val, old_value, uvalue; struct usb_mixer_elem_list *list = snd_kcontrol_chip(kcontrol); struct usb_mixer_interface *mixer = list->mixer; uvalue = ucontrol->value.integer.value[0]; idx = kcontrol->private_value & SND_BBFPRO_MIXER_IDX_MASK; old_value = kcontrol->private_value >> SND_BBFPRO_MIXER_VAL_SHIFT; if (uvalue > SND_BBFPRO_MIXER_VAL_MAX) return -EINVAL; if (uvalue == old_value) return 0; new_val = uvalue & SND_BBFPRO_MIXER_VAL_MASK; kcontrol->private_value = idx | (new_val << SND_BBFPRO_MIXER_VAL_SHIFT); err = snd_bbfpro_vol_update(mixer, idx, new_val); return err < 0 ? err : 1; } static int snd_bbfpro_vol_resume(struct usb_mixer_elem_list *list) { int pv = list->kctl->private_value; u16 idx = pv & SND_BBFPRO_MIXER_IDX_MASK; u32 val = (pv >> SND_BBFPRO_MIXER_VAL_SHIFT) & SND_BBFPRO_MIXER_VAL_MASK; return snd_bbfpro_vol_update(list->mixer, idx, val); } // Predfine elements static const struct snd_kcontrol_new snd_bbfpro_ctl_control = { .iface = SNDRV_CTL_ELEM_IFACE_MIXER, .access = SNDRV_CTL_ELEM_ACCESS_READWRITE, .index = 0, .info = snd_bbfpro_ctl_info, .get = snd_bbfpro_ctl_get, .put = snd_bbfpro_ctl_put }; static const struct snd_kcontrol_new snd_bbfpro_vol_control = { .iface = SNDRV_CTL_ELEM_IFACE_MIXER, .access = SNDRV_CTL_ELEM_ACCESS_READWRITE, .index = 0, .info = snd_bbfpro_vol_info, .get = snd_bbfpro_vol_get, .put = snd_bbfpro_vol_put }; static int snd_bbfpro_ctl_add(struct usb_mixer_interface *mixer, u8 reg, u8 index, char *name) { struct snd_kcontrol_new knew = snd_bbfpro_ctl_control; knew.name = name; knew.private_value = (reg & SND_BBFPRO_CTL_REG_MASK) | ((index & SND_BBFPRO_CTL_IDX_MASK) << SND_BBFPRO_CTL_IDX_SHIFT); return add_single_ctl_with_resume(mixer, 0, snd_bbfpro_ctl_resume, &knew, NULL); } static int snd_bbfpro_vol_add(struct usb_mixer_interface *mixer, u16 index, char *name) { struct snd_kcontrol_new knew = snd_bbfpro_vol_control; knew.name = name; knew.private_value = index & SND_BBFPRO_MIXER_IDX_MASK; return add_single_ctl_with_resume(mixer, 0, snd_bbfpro_vol_resume, &knew, NULL); } static int snd_bbfpro_controls_create(struct usb_mixer_interface *mixer) { int err, i, o; char name[48]; static const char * const input[] = { "AN1", "AN2", "IN3", "IN4", "AS1", "AS2", "ADAT3", "ADAT4", "ADAT5", "ADAT6", "ADAT7", "ADAT8"}; static const char * const output[] = { "AN1", "AN2", "PH3", "PH4", "AS1", "AS2", "ADAT3", "ADAT4", "ADAT5", "ADAT6", "ADAT7", "ADAT8"}; for (o = 0 ; o < 12 ; ++o) { for (i = 0 ; i < 12 ; ++i) { // Line routing snprintf(name, sizeof(name), "%s-%s-%s Playback Volume", (i < 2 ? "Mic" : "Line"), input[i], output[o]); err = snd_bbfpro_vol_add(mixer, (26 * o + i), name); if (err < 0) return err; // PCM routing... yes, it is output remapping snprintf(name, sizeof(name), "PCM-%s-%s Playback Volume", output[i], output[o]); err = snd_bbfpro_vol_add(mixer, (26 * o + 12 + i), name); if (err < 0) return err; } } // Control Reg 1 err = snd_bbfpro_ctl_add(mixer, SND_BBFPRO_CTL_REG1, SND_BBFPRO_CTL_REG1_CLK_OPTICAL, "Sample Clock Source"); if (err < 0) return err; err = snd_bbfpro_ctl_add(mixer, SND_BBFPRO_CTL_REG1, SND_BBFPRO_CTL_REG1_SPDIF_PRO, "IEC958 Pro Mask"); if (err < 0) return err; err = snd_bbfpro_ctl_add(mixer, SND_BBFPRO_CTL_REG1, SND_BBFPRO_CTL_REG1_SPDIF_EMPH, "IEC958 Emphasis"); if (err < 0) return err; err = snd_bbfpro_ctl_add(mixer, SND_BBFPRO_CTL_REG1, SND_BBFPRO_CTL_REG1_SPDIF_OPTICAL, "IEC958 Switch"); if (err < 0) return err; // Control Reg 2 err = snd_bbfpro_ctl_add(mixer, SND_BBFPRO_CTL_REG2, SND_BBFPRO_CTL_REG2_48V_AN1, "Mic-AN1 48V"); if (err < 0) return err; err = snd_bbfpro_ctl_add(mixer, SND_BBFPRO_CTL_REG2, SND_BBFPRO_CTL_REG2_48V_AN2, "Mic-AN2 48V"); if (err < 0) return err; err = snd_bbfpro_ctl_add(mixer, SND_BBFPRO_CTL_REG2, SND_BBFPRO_CTL_REG2_SENS_IN3, "Line-IN3 Sens."); if (err < 0) return err; err = snd_bbfpro_ctl_add(mixer, SND_BBFPRO_CTL_REG2, SND_BBFPRO_CTL_REG2_SENS_IN4, "Line-IN4 Sens."); if (err < 0) return err; err = snd_bbfpro_ctl_add(mixer, SND_BBFPRO_CTL_REG2, SND_BBFPRO_CTL_REG2_PAD_AN1, "Mic-AN1 PAD"); if (err < 0) return err; err = snd_bbfpro_ctl_add(mixer, SND_BBFPRO_CTL_REG2, SND_BBFPRO_CTL_REG2_PAD_AN2, "Mic-AN2 PAD"); if (err < 0) return err; return 0; } /* * Pioneer DJ DJM Mixers * * These devices generally have options for soft-switching the playback and * capture sources in addition to the recording level. Although different * devices have different configurations, there seems to be canonical values * for specific capture/playback types: See the definitions of these below. * * The wValue is masked with the stereo channel number. e.g. Setting Ch2 to * capture phono would be 0x0203. Capture, playback and capture level have * different wIndexes. */ // Capture types #define SND_DJM_CAP_LINE 0x00 #define SND_DJM_CAP_CDLINE 0x01 #define SND_DJM_CAP_DIGITAL 0x02 #define SND_DJM_CAP_PHONO 0x03 #define SND_DJM_CAP_PFADER 0x06 #define SND_DJM_CAP_XFADERA 0x07 #define SND_DJM_CAP_XFADERB 0x08 #define SND_DJM_CAP_MIC 0x09 #define SND_DJM_CAP_AUX 0x0d #define SND_DJM_CAP_RECOUT 0x0a #define SND_DJM_CAP_NONE 0x0f #define SND_DJM_CAP_CH1PFADER 0x11 #define SND_DJM_CAP_CH2PFADER 0x12 #define SND_DJM_CAP_CH3PFADER 0x13 #define SND_DJM_CAP_CH4PFADER 0x14 // Playback types #define SND_DJM_PB_CH1 0x00 #define SND_DJM_PB_CH2 0x01 #define SND_DJM_PB_AUX 0x04 #define SND_DJM_WINDEX_CAP 0x8002 #define SND_DJM_WINDEX_CAPLVL 0x8003 #define SND_DJM_WINDEX_PB 0x8016 // kcontrol->private_value layout #define SND_DJM_VALUE_MASK 0x0000ffff #define SND_DJM_GROUP_MASK 0x00ff0000 #define SND_DJM_DEVICE_MASK 0xff000000 #define SND_DJM_GROUP_SHIFT 16 #define SND_DJM_DEVICE_SHIFT 24 // device table index // used for the snd_djm_devices table, so please update accordingly #define SND_DJM_250MK2_IDX 0x0 #define SND_DJM_750_IDX 0x1 #define SND_DJM_850_IDX 0x2 #define SND_DJM_900NXS2_IDX 0x3 #define SND_DJM_750MK2_IDX 0x4 #define SND_DJM_450_IDX 0x5 #define SND_DJM_CTL(_name, suffix, _default_value, _windex) { \ .name = _name, \ .options = snd_djm_opts_##suffix, \ .noptions = ARRAY_SIZE(snd_djm_opts_##suffix), \ .default_value = _default_value, \ .wIndex = _windex } #define SND_DJM_DEVICE(suffix) { \ .controls = snd_djm_ctls_##suffix, \ .ncontrols = ARRAY_SIZE(snd_djm_ctls_##suffix) } struct snd_djm_device { const char *name; const struct snd_djm_ctl *controls; size_t ncontrols; }; struct snd_djm_ctl { const char *name; const u16 *options; size_t noptions; u16 default_value; u16 wIndex; }; static const char *snd_djm_get_label_caplevel(u16 wvalue) { switch (wvalue) { case 0x0000: return "-19dB"; case 0x0100: return "-15dB"; case 0x0200: return "-10dB"; case 0x0300: return "-5dB"; default: return NULL; } }; static const char *snd_djm_get_label_cap_common(u16 wvalue) { switch (wvalue & 0x00ff) { case SND_DJM_CAP_LINE: return "Control Tone LINE"; case SND_DJM_CAP_CDLINE: return "Control Tone CD/LINE"; case SND_DJM_CAP_DIGITAL: return "Control Tone DIGITAL"; case SND_DJM_CAP_PHONO: return "Control Tone PHONO"; case SND_DJM_CAP_PFADER: return "Post Fader"; case SND_DJM_CAP_XFADERA: return "Cross Fader A"; case SND_DJM_CAP_XFADERB: return "Cross Fader B"; case SND_DJM_CAP_MIC: return "Mic"; case SND_DJM_CAP_RECOUT: return "Rec Out"; case SND_DJM_CAP_AUX: return "Aux"; case SND_DJM_CAP_NONE: return "None"; case SND_DJM_CAP_CH1PFADER: return "Post Fader Ch1"; case SND_DJM_CAP_CH2PFADER: return "Post Fader Ch2"; case SND_DJM_CAP_CH3PFADER: return "Post Fader Ch3"; case SND_DJM_CAP_CH4PFADER: return "Post Fader Ch4"; default: return NULL; } }; // The DJM-850 has different values for CD/LINE and LINE capture // control options than the other DJM declared in this file. static const char *snd_djm_get_label_cap_850(u16 wvalue) { switch (wvalue & 0x00ff) { case 0x00: return "Control Tone CD/LINE"; case 0x01: return "Control Tone LINE"; default: return snd_djm_get_label_cap_common(wvalue); } }; static const char *snd_djm_get_label_cap(u8 device_idx, u16 wvalue) { switch (device_idx) { case SND_DJM_850_IDX: return snd_djm_get_label_cap_850(wvalue); default: return snd_djm_get_label_cap_common(wvalue); } }; static const char *snd_djm_get_label_pb(u16 wvalue) { switch (wvalue & 0x00ff) { case SND_DJM_PB_CH1: return "Ch1"; case SND_DJM_PB_CH2: return "Ch2"; case SND_DJM_PB_AUX: return "Aux"; default: return NULL; } }; static const char *snd_djm_get_label(u8 device_idx, u16 wvalue, u16 windex) { switch (windex) { case SND_DJM_WINDEX_CAPLVL: return snd_djm_get_label_caplevel(wvalue); case SND_DJM_WINDEX_CAP: return snd_djm_get_label_cap(device_idx, wvalue); case SND_DJM_WINDEX_PB: return snd_djm_get_label_pb(wvalue); default: return NULL; } }; // common DJM capture level option values static const u16 snd_djm_opts_cap_level[] = { 0x0000, 0x0100, 0x0200, 0x0300 }; // DJM-250MK2 static const u16 snd_djm_opts_250mk2_cap1[] = { 0x0103, 0x0100, 0x0106, 0x0107, 0x0108, 0x0109, 0x010d, 0x010a }; static const u16 snd_djm_opts_250mk2_cap2[] = { 0x0203, 0x0200, 0x0206, 0x0207, 0x0208, 0x0209, 0x020d, 0x020a }; static const u16 snd_djm_opts_250mk2_cap3[] = { 0x030a, 0x0311, 0x0312, 0x0307, 0x0308, 0x0309, 0x030d }; static const u16 snd_djm_opts_250mk2_pb1[] = { 0x0100, 0x0101, 0x0104 }; static const u16 snd_djm_opts_250mk2_pb2[] = { 0x0200, 0x0201, 0x0204 }; static const u16 snd_djm_opts_250mk2_pb3[] = { 0x0300, 0x0301, 0x0304 }; static const struct snd_djm_ctl snd_djm_ctls_250mk2[] = { SND_DJM_CTL("Capture Level", cap_level, 0, SND_DJM_WINDEX_CAPLVL), SND_DJM_CTL("Ch1 Input", 250mk2_cap1, 2, SND_DJM_WINDEX_CAP), SND_DJM_CTL("Ch2 Input", 250mk2_cap2, 2, SND_DJM_WINDEX_CAP), SND_DJM_CTL("Ch3 Input", 250mk2_cap3, 0, SND_DJM_WINDEX_CAP), SND_DJM_CTL("Ch1 Output", 250mk2_pb1, 0, SND_DJM_WINDEX_PB), SND_DJM_CTL("Ch2 Output", 250mk2_pb2, 1, SND_DJM_WINDEX_PB), SND_DJM_CTL("Ch3 Output", 250mk2_pb3, 2, SND_DJM_WINDEX_PB) }; // DJM-450 static const u16 snd_djm_opts_450_cap1[] = { 0x0103, 0x0100, 0x0106, 0x0107, 0x0108, 0x0109, 0x010d, 0x010a }; static const u16 snd_djm_opts_450_cap2[] = { 0x0203, 0x0200, 0x0206, 0x0207, 0x0208, 0x0209, 0x020d, 0x020a }; static const u16 snd_djm_opts_450_cap3[] = { 0x030a, 0x0311, 0x0312, 0x0307, 0x0308, 0x0309, 0x030d }; static const u16 snd_djm_opts_450_pb1[] = { 0x0100, 0x0101, 0x0104 }; static const u16 snd_djm_opts_450_pb2[] = { 0x0200, 0x0201, 0x0204 }; static const u16 snd_djm_opts_450_pb3[] = { 0x0300, 0x0301, 0x0304 }; static const struct snd_djm_ctl snd_djm_ctls_450[] = { SND_DJM_CTL("Capture Level", cap_level, 0, SND_DJM_WINDEX_CAPLVL), SND_DJM_CTL("Ch1 Input", 450_cap1, 2, SND_DJM_WINDEX_CAP), SND_DJM_CTL("Ch2 Input", 450_cap2, 2, SND_DJM_WINDEX_CAP), SND_DJM_CTL("Ch3 Input", 450_cap3, 0, SND_DJM_WINDEX_CAP), SND_DJM_CTL("Ch1 Output", 450_pb1, 0, SND_DJM_WINDEX_PB), SND_DJM_CTL("Ch2 Output", 450_pb2, 1, SND_DJM_WINDEX_PB), SND_DJM_CTL("Ch3 Output", 450_pb3, 2, SND_DJM_WINDEX_PB) }; // DJM-750 static const u16 snd_djm_opts_750_cap1[] = { 0x0101, 0x0103, 0x0106, 0x0107, 0x0108, 0x0109, 0x010a, 0x010f }; static const u16 snd_djm_opts_750_cap2[] = { 0x0200, 0x0201, 0x0206, 0x0207, 0x0208, 0x0209, 0x020a, 0x020f }; static const u16 snd_djm_opts_750_cap3[] = { 0x0300, 0x0301, 0x0306, 0x0307, 0x0308, 0x0309, 0x030a, 0x030f }; static const u16 snd_djm_opts_750_cap4[] = { 0x0401, 0x0403, 0x0406, 0x0407, 0x0408, 0x0409, 0x040a, 0x040f }; static const struct snd_djm_ctl snd_djm_ctls_750[] = { SND_DJM_CTL("Capture Level", cap_level, 0, SND_DJM_WINDEX_CAPLVL), SND_DJM_CTL("Ch1 Input", 750_cap1, 2, SND_DJM_WINDEX_CAP), SND_DJM_CTL("Ch2 Input", 750_cap2, 2, SND_DJM_WINDEX_CAP), SND_DJM_CTL("Ch3 Input", 750_cap3, 0, SND_DJM_WINDEX_CAP), SND_DJM_CTL("Ch4 Input", 750_cap4, 0, SND_DJM_WINDEX_CAP) }; // DJM-850 static const u16 snd_djm_opts_850_cap1[] = { 0x0100, 0x0103, 0x0106, 0x0107, 0x0108, 0x0109, 0x010a, 0x010f }; static const u16 snd_djm_opts_850_cap2[] = { 0x0200, 0x0201, 0x0206, 0x0207, 0x0208, 0x0209, 0x020a, 0x020f }; static const u16 snd_djm_opts_850_cap3[] = { 0x0300, 0x0301, 0x0306, 0x0307, 0x0308, 0x0309, 0x030a, 0x030f }; static const u16 snd_djm_opts_850_cap4[] = { 0x0400, 0x0403, 0x0406, 0x0407, 0x0408, 0x0409, 0x040a, 0x040f }; static const struct snd_djm_ctl snd_djm_ctls_850[] = { SND_DJM_CTL("Capture Level", cap_level, 0, SND_DJM_WINDEX_CAPLVL), SND_DJM_CTL("Ch1 Input", 850_cap1, 1, SND_DJM_WINDEX_CAP), SND_DJM_CTL("Ch2 Input", 850_cap2, 0, SND_DJM_WINDEX_CAP), SND_DJM_CTL("Ch3 Input", 850_cap3, 0, SND_DJM_WINDEX_CAP), SND_DJM_CTL("Ch4 Input", 850_cap4, 1, SND_DJM_WINDEX_CAP) }; // DJM-900NXS2 static const u16 snd_djm_opts_900nxs2_cap1[] = { 0x0100, 0x0102, 0x0103, 0x0106, 0x0107, 0x0108, 0x0109, 0x010a }; static const u16 snd_djm_opts_900nxs2_cap2[] = { 0x0200, 0x0202, 0x0203, 0x0206, 0x0207, 0x0208, 0x0209, 0x020a }; static const u16 snd_djm_opts_900nxs2_cap3[] = { 0x0300, 0x0302, 0x0303, 0x0306, 0x0307, 0x0308, 0x0309, 0x030a }; static const u16 snd_djm_opts_900nxs2_cap4[] = { 0x0400, 0x0402, 0x0403, 0x0406, 0x0407, 0x0408, 0x0409, 0x040a }; static const u16 snd_djm_opts_900nxs2_cap5[] = { 0x0507, 0x0508, 0x0509, 0x050a, 0x0511, 0x0512, 0x0513, 0x0514 }; static const struct snd_djm_ctl snd_djm_ctls_900nxs2[] = { SND_DJM_CTL("Capture Level", cap_level, 0, SND_DJM_WINDEX_CAPLVL), SND_DJM_CTL("Ch1 Input", 900nxs2_cap1, 2, SND_DJM_WINDEX_CAP), SND_DJM_CTL("Ch2 Input", 900nxs2_cap2, 2, SND_DJM_WINDEX_CAP), SND_DJM_CTL("Ch3 Input", 900nxs2_cap3, 2, SND_DJM_WINDEX_CAP), SND_DJM_CTL("Ch4 Input", 900nxs2_cap4, 2, SND_DJM_WINDEX_CAP), SND_DJM_CTL("Ch5 Input", 900nxs2_cap5, 3, SND_DJM_WINDEX_CAP) }; // DJM-750MK2 static const u16 snd_djm_opts_750mk2_cap1[] = { 0x0100, 0x0102, 0x0103, 0x0106, 0x0107, 0x0108, 0x0109, 0x010a }; static const u16 snd_djm_opts_750mk2_cap2[] = { 0x0200, 0x0202, 0x0203, 0x0206, 0x0207, 0x0208, 0x0209, 0x020a }; static const u16 snd_djm_opts_750mk2_cap3[] = { 0x0300, 0x0302, 0x0303, 0x0306, 0x0307, 0x0308, 0x0309, 0x030a }; static const u16 snd_djm_opts_750mk2_cap4[] = { 0x0400, 0x0402, 0x0403, 0x0406, 0x0407, 0x0408, 0x0409, 0x040a }; static const u16 snd_djm_opts_750mk2_cap5[] = { 0x0507, 0x0508, 0x0509, 0x050a, 0x0511, 0x0512, 0x0513, 0x0514 }; static const u16 snd_djm_opts_750mk2_pb1[] = { 0x0100, 0x0101, 0x0104 }; static const u16 snd_djm_opts_750mk2_pb2[] = { 0x0200, 0x0201, 0x0204 }; static const u16 snd_djm_opts_750mk2_pb3[] = { 0x0300, 0x0301, 0x0304 }; static const struct snd_djm_ctl snd_djm_ctls_750mk2[] = { SND_DJM_CTL("Capture Level", cap_level, 0, SND_DJM_WINDEX_CAPLVL), SND_DJM_CTL("Ch1 Input", 750mk2_cap1, 2, SND_DJM_WINDEX_CAP), SND_DJM_CTL("Ch2 Input", 750mk2_cap2, 2, SND_DJM_WINDEX_CAP), SND_DJM_CTL("Ch3 Input", 750mk2_cap3, 2, SND_DJM_WINDEX_CAP), SND_DJM_CTL("Ch4 Input", 750mk2_cap4, 2, SND_DJM_WINDEX_CAP), SND_DJM_CTL("Ch5 Input", 750mk2_cap5, 3, SND_DJM_WINDEX_CAP), SND_DJM_CTL("Ch1 Output", 750mk2_pb1, 0, SND_DJM_WINDEX_PB), SND_DJM_CTL("Ch2 Output", 750mk2_pb2, 1, SND_DJM_WINDEX_PB), SND_DJM_CTL("Ch3 Output", 750mk2_pb3, 2, SND_DJM_WINDEX_PB) }; static const struct snd_djm_device snd_djm_devices[] = { [SND_DJM_250MK2_IDX] = SND_DJM_DEVICE(250mk2), [SND_DJM_750_IDX] = SND_DJM_DEVICE(750), [SND_DJM_850_IDX] = SND_DJM_DEVICE(850), [SND_DJM_900NXS2_IDX] = SND_DJM_DEVICE(900nxs2), [SND_DJM_750MK2_IDX] = SND_DJM_DEVICE(750mk2), [SND_DJM_450_IDX] = SND_DJM_DEVICE(450), }; static int snd_djm_controls_info(struct snd_kcontrol *kctl, struct snd_ctl_elem_info *info) { unsigned long private_value = kctl->private_value; u8 device_idx = (private_value & SND_DJM_DEVICE_MASK) >> SND_DJM_DEVICE_SHIFT; u8 ctl_idx = (private_value & SND_DJM_GROUP_MASK) >> SND_DJM_GROUP_SHIFT; const struct snd_djm_device *device = &snd_djm_devices[device_idx]; const char *name; const struct snd_djm_ctl *ctl; size_t noptions; if (ctl_idx >= device->ncontrols) return -EINVAL; ctl = &device->controls[ctl_idx]; noptions = ctl->noptions; if (info->value.enumerated.item >= noptions) info->value.enumerated.item = noptions - 1; name = snd_djm_get_label(device_idx, ctl->options[info->value.enumerated.item], ctl->wIndex); if (!name) return -EINVAL; strscpy(info->value.enumerated.name, name, sizeof(info->value.enumerated.name)); info->type = SNDRV_CTL_ELEM_TYPE_ENUMERATED; info->count = 1; info->value.enumerated.items = noptions; return 0; } static int snd_djm_controls_update(struct usb_mixer_interface *mixer, u8 device_idx, u8 group, u16 value) { int err; const struct snd_djm_device *device = &snd_djm_devices[device_idx]; if ((group >= device->ncontrols) || value >= device->controls[group].noptions) return -EINVAL; err = snd_usb_lock_shutdown(mixer->chip); if (err) return err; err = snd_usb_ctl_msg( mixer->chip->dev, usb_sndctrlpipe(mixer->chip->dev, 0), USB_REQ_SET_FEATURE, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE, device->controls[group].options[value], device->controls[group].wIndex, NULL, 0); snd_usb_unlock_shutdown(mixer->chip); return err; } static int snd_djm_controls_get(struct snd_kcontrol *kctl, struct snd_ctl_elem_value *elem) { elem->value.enumerated.item[0] = kctl->private_value & SND_DJM_VALUE_MASK; return 0; } static int snd_djm_controls_put(struct snd_kcontrol *kctl, struct snd_ctl_elem_value *elem) { struct usb_mixer_elem_list *list = snd_kcontrol_chip(kctl); struct usb_mixer_interface *mixer = list->mixer; unsigned long private_value = kctl->private_value; u8 device = (private_value & SND_DJM_DEVICE_MASK) >> SND_DJM_DEVICE_SHIFT; u8 group = (private_value & SND_DJM_GROUP_MASK) >> SND_DJM_GROUP_SHIFT; u16 value = elem->value.enumerated.item[0]; kctl->private_value = (((unsigned long)device << SND_DJM_DEVICE_SHIFT) | (group << SND_DJM_GROUP_SHIFT) | value); return snd_djm_controls_update(mixer, device, group, value); } static int snd_djm_controls_resume(struct usb_mixer_elem_list *list) { unsigned long private_value = list->kctl->private_value; u8 device = (private_value & SND_DJM_DEVICE_MASK) >> SND_DJM_DEVICE_SHIFT; u8 group = (private_value & SND_DJM_GROUP_MASK) >> SND_DJM_GROUP_SHIFT; u16 value = (private_value & SND_DJM_VALUE_MASK); return snd_djm_controls_update(list->mixer, device, group, value); } static int snd_djm_controls_create(struct usb_mixer_interface *mixer, const u8 device_idx) { int err, i; u16 value; const struct snd_djm_device *device = &snd_djm_devices[device_idx]; struct snd_kcontrol_new knew = { .iface = SNDRV_CTL_ELEM_IFACE_MIXER, .access = SNDRV_CTL_ELEM_ACCESS_READWRITE, .index = 0, .info = snd_djm_controls_info, .get = snd_djm_controls_get, .put = snd_djm_controls_put }; for (i = 0; i < device->ncontrols; i++) { value = device->controls[i].default_value; knew.name = device->controls[i].name; knew.private_value = ( ((unsigned long)device_idx << SND_DJM_DEVICE_SHIFT) | (i << SND_DJM_GROUP_SHIFT) | value); err = snd_djm_controls_update(mixer, device_idx, i, value); if (err) return err; err = add_single_ctl_with_resume(mixer, 0, snd_djm_controls_resume, &knew, NULL); if (err) return err; } return 0; } int snd_usb_mixer_apply_create_quirk(struct usb_mixer_interface *mixer) { int err = 0; err = snd_usb_soundblaster_remote_init(mixer); if (err < 0) return err; switch (mixer->chip->usb_id) { /* Tascam US-16x08 */ case USB_ID(0x0644, 0x8047): err = snd_us16x08_controls_create(mixer); break; case USB_ID(0x041e, 0x3020): case USB_ID(0x041e, 0x3040): case USB_ID(0x041e, 0x3042): case USB_ID(0x041e, 0x30df): case USB_ID(0x041e, 0x3048): err = snd_audigy2nx_controls_create(mixer); if (err < 0) break; snd_card_ro_proc_new(mixer->chip->card, "audigy2nx", mixer, snd_audigy2nx_proc_read); break; /* EMU0204 */ case USB_ID(0x041e, 0x3f19): err = snd_emu0204_controls_create(mixer); break; case USB_ID(0x0763, 0x2030): /* M-Audio Fast Track C400 */ case USB_ID(0x0763, 0x2031): /* M-Audio Fast Track C400 */ err = snd_c400_create_mixer(mixer); break; case USB_ID(0x0763, 0x2080): /* M-Audio Fast Track Ultra */ case USB_ID(0x0763, 0x2081): /* M-Audio Fast Track Ultra 8R */ err = snd_ftu_create_mixer(mixer); break; case USB_ID(0x0b05, 0x1739): /* ASUS Xonar U1 */ case USB_ID(0x0b05, 0x1743): /* ASUS Xonar U1 (2) */ case USB_ID(0x0b05, 0x17a0): /* ASUS Xonar U3 */ err = snd_xonar_u1_controls_create(mixer); break; case USB_ID(0x0d8c, 0x0103): /* Audio Advantage Micro II */ err = snd_microii_controls_create(mixer); break; case USB_ID(0x0dba, 0x1000): /* Digidesign Mbox 1 */ err = snd_mbox1_controls_create(mixer); break; case USB_ID(0x17cc, 0x1011): /* Traktor Audio 6 */ err = snd_nativeinstruments_create_mixer(mixer, snd_nativeinstruments_ta6_mixers, ARRAY_SIZE(snd_nativeinstruments_ta6_mixers)); break; case USB_ID(0x17cc, 0x1021): /* Traktor Audio 10 */ err = snd_nativeinstruments_create_mixer(mixer, snd_nativeinstruments_ta10_mixers, ARRAY_SIZE(snd_nativeinstruments_ta10_mixers)); break; case USB_ID(0x200c, 0x1018): /* Electrix Ebox-44 */ /* detection is disabled in mixer_maps.c */ err = snd_create_std_mono_table(mixer, ebox44_table); break; case USB_ID(0x1235, 0x8012): /* Focusrite Scarlett 6i6 */ case USB_ID(0x1235, 0x8002): /* Focusrite Scarlett 8i6 */ case USB_ID(0x1235, 0x8004): /* Focusrite Scarlett 18i6 */ case USB_ID(0x1235, 0x8014): /* Focusrite Scarlett 18i8 */ case USB_ID(0x1235, 0x800c): /* Focusrite Scarlett 18i20 */ err = snd_scarlett_controls_create(mixer); break; case USB_ID(0x1235, 0x8203): /* Focusrite Scarlett 6i6 2nd Gen */ case USB_ID(0x1235, 0x8204): /* Focusrite Scarlett 18i8 2nd Gen */ case USB_ID(0x1235, 0x8201): /* Focusrite Scarlett 18i20 2nd Gen */ case USB_ID(0x1235, 0x8211): /* Focusrite Scarlett Solo 3rd Gen */ case USB_ID(0x1235, 0x8210): /* Focusrite Scarlett 2i2 3rd Gen */ case USB_ID(0x1235, 0x8212): /* Focusrite Scarlett 4i4 3rd Gen */ case USB_ID(0x1235, 0x8213): /* Focusrite Scarlett 8i6 3rd Gen */ case USB_ID(0x1235, 0x8214): /* Focusrite Scarlett 18i8 3rd Gen */ case USB_ID(0x1235, 0x8215): /* Focusrite Scarlett 18i20 3rd Gen */ case USB_ID(0x1235, 0x8206): /* Focusrite Clarett 2Pre USB */ case USB_ID(0x1235, 0x8207): /* Focusrite Clarett 4Pre USB */ case USB_ID(0x1235, 0x8208): /* Focusrite Clarett 8Pre USB */ case USB_ID(0x1235, 0x820a): /* Focusrite Clarett+ 2Pre */ case USB_ID(0x1235, 0x820b): /* Focusrite Clarett+ 4Pre */ case USB_ID(0x1235, 0x820c): /* Focusrite Clarett+ 8Pre */ err = snd_scarlett2_init(mixer); break; case USB_ID(0x041e, 0x323b): /* Creative Sound Blaster E1 */ err = snd_soundblaster_e1_switch_create(mixer); break; case USB_ID(0x0bda, 0x4014): /* Dell WD15 dock */ err = dell_dock_mixer_create(mixer); if (err < 0) break; err = dell_dock_mixer_init(mixer); break; case USB_ID(0x0bda, 0x402e): /* Dell WD19 dock */ err = dell_dock_mixer_create(mixer); break; case USB_ID(0x2a39, 0x3fd2): /* RME ADI-2 Pro */ case USB_ID(0x2a39, 0x3fd3): /* RME ADI-2 DAC */ case USB_ID(0x2a39, 0x3fd4): /* RME */ err = snd_rme_controls_create(mixer); break; case USB_ID(0x194f, 0x010c): /* Presonus Studio 1810c */ err = snd_sc1810_init_mixer(mixer); break; case USB_ID(0x2a39, 0x3fb0): /* RME Babyface Pro FS */ err = snd_bbfpro_controls_create(mixer); break; case USB_ID(0x2b73, 0x0017): /* Pioneer DJ DJM-250MK2 */ err = snd_djm_controls_create(mixer, SND_DJM_250MK2_IDX); break; case USB_ID(0x2b73, 0x0013): /* Pioneer DJ DJM-450 */ err = snd_djm_controls_create(mixer, SND_DJM_450_IDX); break; case USB_ID(0x08e4, 0x017f): /* Pioneer DJ DJM-750 */ err = snd_djm_controls_create(mixer, SND_DJM_750_IDX); break; case USB_ID(0x2b73, 0x001b): /* Pioneer DJ DJM-750MK2 */ err = snd_djm_controls_create(mixer, SND_DJM_750MK2_IDX); break; case USB_ID(0x08e4, 0x0163): /* Pioneer DJ DJM-850 */ err = snd_djm_controls_create(mixer, SND_DJM_850_IDX); break; case USB_ID(0x2b73, 0x000a): /* Pioneer DJ DJM-900NXS2 */ err = snd_djm_controls_create(mixer, SND_DJM_900NXS2_IDX); break; } return err; } void snd_usb_mixer_resume_quirk(struct usb_mixer_interface *mixer) { switch (mixer->chip->usb_id) { case USB_ID(0x0bda, 0x4014): /* Dell WD15 dock */ dell_dock_mixer_init(mixer); break; } } void snd_usb_mixer_rc_memory_change(struct usb_mixer_interface *mixer, int unitid) { if (!mixer->rc_cfg) return; /* unit ids specific to Extigy/Audigy 2 NX: */ switch (unitid) { case 0: /* remote control */ mixer->rc_urb->dev = mixer->chip->dev; usb_submit_urb(mixer->rc_urb, GFP_ATOMIC); break; case 4: /* digital in jack */ case 7: /* line in jacks */ case 19: /* speaker out jacks */ case 20: /* headphones out jack */ break; /* live24ext: 4 = line-in jack */ case 3: /* hp-out jack (may actuate Mute) */ if (mixer->chip->usb_id == USB_ID(0x041e, 0x3040) || mixer->chip->usb_id == USB_ID(0x041e, 0x3048)) snd_usb_mixer_notify_id(mixer, mixer->rc_cfg->mute_mixer_id); break; default: usb_audio_dbg(mixer->chip, "memory change in unknown unit %d\n", unitid); break; } } static void snd_dragonfly_quirk_db_scale(struct usb_mixer_interface *mixer, struct usb_mixer_elem_info *cval, struct snd_kcontrol *kctl) { /* Approximation using 10 ranges based on output measurement on hw v1.2. * This seems close to the cubic mapping e.g. alsamixer uses. */ static const DECLARE_TLV_DB_RANGE(scale, 0, 1, TLV_DB_MINMAX_ITEM(-5300, -4970), 2, 5, TLV_DB_MINMAX_ITEM(-4710, -4160), 6, 7, TLV_DB_MINMAX_ITEM(-3884, -3710), 8, 14, TLV_DB_MINMAX_ITEM(-3443, -2560), 15, 16, TLV_DB_MINMAX_ITEM(-2475, -2324), 17, 19, TLV_DB_MINMAX_ITEM(-2228, -2031), 20, 26, TLV_DB_MINMAX_ITEM(-1910, -1393), 27, 31, TLV_DB_MINMAX_ITEM(-1322, -1032), 32, 40, TLV_DB_MINMAX_ITEM(-968, -490), 41, 50, TLV_DB_MINMAX_ITEM(-441, 0), ); if (cval->min == 0 && cval->max == 50) { usb_audio_info(mixer->chip, "applying DragonFly dB scale quirk (0-50 variant)\n"); kctl->tlv.p = scale; kctl->vd[0].access |= SNDRV_CTL_ELEM_ACCESS_TLV_READ; kctl->vd[0].access &= ~SNDRV_CTL_ELEM_ACCESS_TLV_CALLBACK; } else if (cval->min == 0 && cval->max <= 1000) { /* Some other clearly broken DragonFly variant. * At least a 0..53 variant (hw v1.0) exists. */ usb_audio_info(mixer->chip, "ignoring too narrow dB range on a DragonFly device"); kctl->vd[0].access &= ~SNDRV_CTL_ELEM_ACCESS_TLV_CALLBACK; } } void snd_usb_mixer_fu_apply_quirk(struct usb_mixer_interface *mixer, struct usb_mixer_elem_info *cval, int unitid, struct snd_kcontrol *kctl) { switch (mixer->chip->usb_id) { case USB_ID(0x21b4, 0x0081): /* AudioQuest DragonFly */ if (unitid == 7 && cval->control == UAC_FU_VOLUME) snd_dragonfly_quirk_db_scale(mixer, cval, kctl); break; /* lowest playback value is muted on some devices */ case USB_ID(0x0d8c, 0x000c): /* C-Media */ case USB_ID(0x0d8c, 0x0014): /* C-Media */ case USB_ID(0x19f7, 0x0003): /* RODE NT-USB */ if (strstr(kctl->id.name, "Playback")) cval->min_mute = 1; break; } } |
2 2 124 125 76 3 48 437 3 9 47 14 127 3 3 7 52 4 61 63 1 52 89 4 17 3 9 6 5 3 3 3 1 220 125 1278 125 689 800 14 14 14 14 14 14 89 89 2 1989 66 1983 100 1985 7 1991 3 14 14 1985 1993 621 37 712 627 98 689 626 65 23 45 809 792 72 13 796 50 800 150 29 125 133 133 41 127 14 2 6 3 1 3 3 6 15 11 9 9 6 3 47 6 2 3 2 5 1631 136 1006 142 30 58 148 3 6 57 22 48 220 437 3 63 1355 1265 165 159 921 921 187 47 60 559 605 918 923 921 413 642 35 35 283 284 284 84 11 48 6 8 7 12 3 8 4 62 14 50 366 366 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/export.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/if_vlan.h> #include <linux/filter.h> #include <net/dsa.h> #include <net/dst_metadata.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/gre.h> #include <net/pptp.h> #include <net/tipc.h> #include <linux/igmp.h> #include <linux/icmp.h> #include <linux/sctp.h> #include <linux/dccp.h> #include <linux/if_tunnel.h> #include <linux/if_pppox.h> #include <linux/ppp_defs.h> #include <linux/stddef.h> #include <linux/if_ether.h> #include <linux/if_hsr.h> #include <linux/mpls.h> #include <linux/tcp.h> #include <linux/ptp_classify.h> #include <net/flow_dissector.h> #include <scsi/fc/fc_fcoe.h> #include <uapi/linux/batadv_packet.h> #include <linux/bpf.h> #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_labels.h> #endif #include <linux/bpf-netns.h> static void dissector_set_key(struct flow_dissector *flow_dissector, enum flow_dissector_key_id key_id) { flow_dissector->used_keys |= (1 << key_id); } void skb_flow_dissector_init(struct flow_dissector *flow_dissector, const struct flow_dissector_key *key, unsigned int key_count) { unsigned int i; memset(flow_dissector, 0, sizeof(*flow_dissector)); for (i = 0; i < key_count; i++, key++) { /* User should make sure that every key target offset is within * boundaries of unsigned short. */ BUG_ON(key->offset > USHRT_MAX); BUG_ON(dissector_uses_key(flow_dissector, key->key_id)); dissector_set_key(flow_dissector, key->key_id); flow_dissector->offset[key->key_id] = key->offset; } /* Ensure that the dissector always includes control and basic key. * That way we are able to avoid handling lack of these in fast path. */ BUG_ON(!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_CONTROL)); BUG_ON(!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_BASIC)); } EXPORT_SYMBOL(skb_flow_dissector_init); #ifdef CONFIG_BPF_SYSCALL int flow_dissector_bpf_prog_attach_check(struct net *net, struct bpf_prog *prog) { enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR; if (net == &init_net) { /* BPF flow dissector in the root namespace overrides * any per-net-namespace one. When attaching to root, * make sure we don't have any BPF program attached * to the non-root namespaces. */ struct net *ns; for_each_net(ns) { if (ns == &init_net) continue; if (rcu_access_pointer(ns->bpf.run_array[type])) return -EEXIST; } } else { /* Make sure root flow dissector is not attached * when attaching to the non-root namespace. */ if (rcu_access_pointer(init_net.bpf.run_array[type])) return -EEXIST; } return 0; } #endif /* CONFIG_BPF_SYSCALL */ /** * __skb_flow_get_ports - extract the upper layer ports and return them * @skb: sk_buff to extract the ports from * @thoff: transport header offset * @ip_proto: protocol for which to get port offset * @data: raw buffer pointer to the packet, if NULL use skb->data * @hlen: packet header length, if @data is NULL use skb_headlen(skb) * * The function will try to retrieve the ports at offset thoff + poff where poff * is the protocol port offset returned from proto_ports_offset */ __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, const void *data, int hlen) { int poff = proto_ports_offset(ip_proto); if (!data) { data = skb->data; hlen = skb_headlen(skb); } if (poff >= 0) { __be32 *ports, _ports; ports = __skb_header_pointer(skb, thoff + poff, sizeof(_ports), data, hlen, &_ports); if (ports) return *ports; } return 0; } EXPORT_SYMBOL(__skb_flow_get_ports); static bool icmp_has_id(u8 type) { switch (type) { case ICMP_ECHO: case ICMP_ECHOREPLY: case ICMP_TIMESTAMP: case ICMP_TIMESTAMPREPLY: case ICMPV6_ECHO_REQUEST: case ICMPV6_ECHO_REPLY: return true; } return false; } /** * skb_flow_get_icmp_tci - extract ICMP(6) Type, Code and Identifier fields * @skb: sk_buff to extract from * @key_icmp: struct flow_dissector_key_icmp to fill * @data: raw buffer pointer to the packet * @thoff: offset to extract at * @hlen: packet header length */ void skb_flow_get_icmp_tci(const struct sk_buff *skb, struct flow_dissector_key_icmp *key_icmp, const void *data, int thoff, int hlen) { struct icmphdr *ih, _ih; ih = __skb_header_pointer(skb, thoff, sizeof(_ih), data, hlen, &_ih); if (!ih) return; key_icmp->type = ih->type; key_icmp->code = ih->code; /* As we use 0 to signal that the Id field is not present, * avoid confusion with packets without such field */ if (icmp_has_id(ih->type)) key_icmp->id = ih->un.echo.id ? ntohs(ih->un.echo.id) : 1; else key_icmp->id = 0; } EXPORT_SYMBOL(skb_flow_get_icmp_tci); /* If FLOW_DISSECTOR_KEY_ICMP is set, dissect an ICMP packet * using skb_flow_get_icmp_tci(). */ static void __skb_flow_dissect_icmp(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, int thoff, int hlen) { struct flow_dissector_key_icmp *key_icmp; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ICMP)) return; key_icmp = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ICMP, target_container); skb_flow_get_icmp_tci(skb, key_icmp, data, thoff, hlen); } static void __skb_flow_dissect_l2tpv3(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, int nhoff, int hlen) { struct flow_dissector_key_l2tpv3 *key_l2tpv3; struct { __be32 session_id; } *hdr, _hdr; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_L2TPV3)) return; hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) return; key_l2tpv3 = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_L2TPV3, target_container); key_l2tpv3->session_id = hdr->session_id; } void skb_flow_dissect_meta(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container) { struct flow_dissector_key_meta *meta; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_META)) return; meta = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_META, target_container); meta->ingress_ifindex = skb->skb_iif; } EXPORT_SYMBOL(skb_flow_dissect_meta); static void skb_flow_dissect_set_enc_addr_type(enum flow_dissector_key_id type, struct flow_dissector *flow_dissector, void *target_container) { struct flow_dissector_key_control *ctrl; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_CONTROL)) return; ctrl = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ENC_CONTROL, target_container); ctrl->addr_type = type; } void skb_flow_dissect_ct(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, u16 *ctinfo_map, size_t mapsize, bool post_ct, u16 zone) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) struct flow_dissector_key_ct *key; enum ip_conntrack_info ctinfo; struct nf_conn_labels *cl; struct nf_conn *ct; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_CT)) return; ct = nf_ct_get(skb, &ctinfo); if (!ct && !post_ct) return; key = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_CT, target_container); if (!ct) { key->ct_state = TCA_FLOWER_KEY_CT_FLAGS_TRACKED | TCA_FLOWER_KEY_CT_FLAGS_INVALID; key->ct_zone = zone; return; } if (ctinfo < mapsize) key->ct_state = ctinfo_map[ctinfo]; #if IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) key->ct_zone = ct->zone.id; #endif #if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) key->ct_mark = READ_ONCE(ct->mark); #endif cl = nf_ct_labels_find(ct); if (cl) memcpy(key->ct_labels, cl->bits, sizeof(key->ct_labels)); #endif /* CONFIG_NF_CONNTRACK */ } EXPORT_SYMBOL(skb_flow_dissect_ct); void skb_flow_dissect_tunnel_info(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container) { struct ip_tunnel_info *info; struct ip_tunnel_key *key; /* A quick check to see if there might be something to do. */ if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_KEYID) && !dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS) && !dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS) && !dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_CONTROL) && !dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_PORTS) && !dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IP) && !dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_OPTS)) return; info = skb_tunnel_info(skb); if (!info) return; key = &info->key; switch (ip_tunnel_info_af(info)) { case AF_INET: skb_flow_dissect_set_enc_addr_type(FLOW_DISSECTOR_KEY_IPV4_ADDRS, flow_dissector, target_container); if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS)) { struct flow_dissector_key_ipv4_addrs *ipv4; ipv4 = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, target_container); ipv4->src = key->u.ipv4.src; ipv4->dst = key->u.ipv4.dst; } break; case AF_INET6: skb_flow_dissect_set_enc_addr_type(FLOW_DISSECTOR_KEY_IPV6_ADDRS, flow_dissector, target_container); if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS)) { struct flow_dissector_key_ipv6_addrs *ipv6; ipv6 = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS, target_container); ipv6->src = key->u.ipv6.src; ipv6->dst = key->u.ipv6.dst; } break; } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_KEYID)) { struct flow_dissector_key_keyid *keyid; keyid = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ENC_KEYID, target_container); keyid->keyid = tunnel_id_to_key32(key->tun_id); } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_PORTS)) { struct flow_dissector_key_ports *tp; tp = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ENC_PORTS, target_container); tp->src = key->tp_src; tp->dst = key->tp_dst; } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IP)) { struct flow_dissector_key_ip *ip; ip = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IP, target_container); ip->tos = key->tos; ip->ttl = key->ttl; } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_OPTS)) { struct flow_dissector_key_enc_opts *enc_opt; enc_opt = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ENC_OPTS, target_container); if (info->options_len) { enc_opt->len = info->options_len; ip_tunnel_info_opts_get(enc_opt->data, info); enc_opt->dst_opt_type = info->key.tun_flags & TUNNEL_OPTIONS_PRESENT; } } } EXPORT_SYMBOL(skb_flow_dissect_tunnel_info); void skb_flow_dissect_hash(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container) { struct flow_dissector_key_hash *key; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_HASH)) return; key = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_HASH, target_container); key->hash = skb_get_hash_raw(skb); } EXPORT_SYMBOL(skb_flow_dissect_hash); static enum flow_dissect_ret __skb_flow_dissect_mpls(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, int nhoff, int hlen, int lse_index, bool *entropy_label) { struct mpls_label *hdr, _hdr; u32 entry, label, bos; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_MPLS_ENTROPY) && !dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_MPLS)) return FLOW_DISSECT_RET_OUT_GOOD; if (lse_index >= FLOW_DIS_MPLS_MAX) return FLOW_DISSECT_RET_OUT_GOOD; hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) return FLOW_DISSECT_RET_OUT_BAD; entry = ntohl(hdr->entry); label = (entry & MPLS_LS_LABEL_MASK) >> MPLS_LS_LABEL_SHIFT; bos = (entry & MPLS_LS_S_MASK) >> MPLS_LS_S_SHIFT; if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_MPLS)) { struct flow_dissector_key_mpls *key_mpls; struct flow_dissector_mpls_lse *lse; key_mpls = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_MPLS, target_container); lse = &key_mpls->ls[lse_index]; lse->mpls_ttl = (entry & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT; lse->mpls_bos = bos; lse->mpls_tc = (entry & MPLS_LS_TC_MASK) >> MPLS_LS_TC_SHIFT; lse->mpls_label = label; dissector_set_mpls_lse(key_mpls, lse_index); } if (*entropy_label && dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_MPLS_ENTROPY)) { struct flow_dissector_key_keyid *key_keyid; key_keyid = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_MPLS_ENTROPY, target_container); key_keyid->keyid = cpu_to_be32(label); } *entropy_label = label == MPLS_LABEL_ENTROPY; return bos ? FLOW_DISSECT_RET_OUT_GOOD : FLOW_DISSECT_RET_PROTO_AGAIN; } static enum flow_dissect_ret __skb_flow_dissect_arp(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, int nhoff, int hlen) { struct flow_dissector_key_arp *key_arp; struct { unsigned char ar_sha[ETH_ALEN]; unsigned char ar_sip[4]; unsigned char ar_tha[ETH_ALEN]; unsigned char ar_tip[4]; } *arp_eth, _arp_eth; const struct arphdr *arp; struct arphdr _arp; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ARP)) return FLOW_DISSECT_RET_OUT_GOOD; arp = __skb_header_pointer(skb, nhoff, sizeof(_arp), data, hlen, &_arp); if (!arp) return FLOW_DISSECT_RET_OUT_BAD; if (arp->ar_hrd != htons(ARPHRD_ETHER) || arp->ar_pro != htons(ETH_P_IP) || arp->ar_hln != ETH_ALEN || arp->ar_pln != 4 || (arp->ar_op != htons(ARPOP_REPLY) && arp->ar_op != htons(ARPOP_REQUEST))) return FLOW_DISSECT_RET_OUT_BAD; arp_eth = __skb_header_pointer(skb, nhoff + sizeof(_arp), sizeof(_arp_eth), data, hlen, &_arp_eth); if (!arp_eth) return FLOW_DISSECT_RET_OUT_BAD; key_arp = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ARP, target_container); memcpy(&key_arp->sip, arp_eth->ar_sip, sizeof(key_arp->sip)); memcpy(&key_arp->tip, arp_eth->ar_tip, sizeof(key_arp->tip)); /* Only store the lower byte of the opcode; * this covers ARPOP_REPLY and ARPOP_REQUEST. */ key_arp->op = ntohs(arp->ar_op) & 0xff; ether_addr_copy(key_arp->sha, arp_eth->ar_sha); ether_addr_copy(key_arp->tha, arp_eth->ar_tha); return FLOW_DISSECT_RET_OUT_GOOD; } static enum flow_dissect_ret __skb_flow_dissect_gre(const struct sk_buff *skb, struct flow_dissector_key_control *key_control, struct flow_dissector *flow_dissector, void *target_container, const void *data, __be16 *p_proto, int *p_nhoff, int *p_hlen, unsigned int flags) { struct flow_dissector_key_keyid *key_keyid; struct gre_base_hdr *hdr, _hdr; int offset = 0; u16 gre_ver; hdr = __skb_header_pointer(skb, *p_nhoff, sizeof(_hdr), data, *p_hlen, &_hdr); if (!hdr) return FLOW_DISSECT_RET_OUT_BAD; /* Only look inside GRE without routing */ if (hdr->flags & GRE_ROUTING) return FLOW_DISSECT_RET_OUT_GOOD; /* Only look inside GRE for version 0 and 1 */ gre_ver = ntohs(hdr->flags & GRE_VERSION); if (gre_ver > 1) return FLOW_DISSECT_RET_OUT_GOOD; *p_proto = hdr->protocol; if (gre_ver) { /* Version1 must be PPTP, and check the flags */ if (!(*p_proto == GRE_PROTO_PPP && (hdr->flags & GRE_KEY))) return FLOW_DISSECT_RET_OUT_GOOD; } offset += sizeof(struct gre_base_hdr); if (hdr->flags & GRE_CSUM) offset += sizeof_field(struct gre_full_hdr, csum) + sizeof_field(struct gre_full_hdr, reserved1); if (hdr->flags & GRE_KEY) { const __be32 *keyid; __be32 _keyid; keyid = __skb_header_pointer(skb, *p_nhoff + offset, sizeof(_keyid), data, *p_hlen, &_keyid); if (!keyid) return FLOW_DISSECT_RET_OUT_BAD; if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_GRE_KEYID)) { key_keyid = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_GRE_KEYID, target_container); if (gre_ver == 0) key_keyid->keyid = *keyid; else key_keyid->keyid = *keyid & GRE_PPTP_KEY_MASK; } offset += sizeof_field(struct gre_full_hdr, key); } if (hdr->flags & GRE_SEQ) offset += sizeof_field(struct pptp_gre_header, seq); if (gre_ver == 0) { if (*p_proto == htons(ETH_P_TEB)) { const struct ethhdr *eth; struct ethhdr _eth; eth = __skb_header_pointer(skb, *p_nhoff + offset, sizeof(_eth), data, *p_hlen, &_eth); if (!eth) return FLOW_DISSECT_RET_OUT_BAD; *p_proto = eth->h_proto; offset += sizeof(*eth); /* Cap headers that we access via pointers at the * end of the Ethernet header as our maximum alignment * at that point is only 2 bytes. */ if (NET_IP_ALIGN) *p_hlen = *p_nhoff + offset; } } else { /* version 1, must be PPTP */ u8 _ppp_hdr[PPP_HDRLEN]; u8 *ppp_hdr; if (hdr->flags & GRE_ACK) offset += sizeof_field(struct pptp_gre_header, ack); ppp_hdr = __skb_header_pointer(skb, *p_nhoff + offset, sizeof(_ppp_hdr), data, *p_hlen, _ppp_hdr); if (!ppp_hdr) return FLOW_DISSECT_RET_OUT_BAD; switch (PPP_PROTOCOL(ppp_hdr)) { case PPP_IP: *p_proto = htons(ETH_P_IP); break; case PPP_IPV6: *p_proto = htons(ETH_P_IPV6); break; default: /* Could probably catch some more like MPLS */ break; } offset += PPP_HDRLEN; } *p_nhoff += offset; key_control->flags |= FLOW_DIS_ENCAPSULATION; if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) return FLOW_DISSECT_RET_OUT_GOOD; return FLOW_DISSECT_RET_PROTO_AGAIN; } /** * __skb_flow_dissect_batadv() - dissect batman-adv header * @skb: sk_buff to with the batman-adv header * @key_control: flow dissectors control key * @data: raw buffer pointer to the packet, if NULL use skb->data * @p_proto: pointer used to update the protocol to process next * @p_nhoff: pointer used to update inner network header offset * @hlen: packet header length * @flags: any combination of FLOW_DISSECTOR_F_* * * ETH_P_BATMAN packets are tried to be dissected. Only * &struct batadv_unicast packets are actually processed because they contain an * inner ethernet header and are usually followed by actual network header. This * allows the flow dissector to continue processing the packet. * * Return: FLOW_DISSECT_RET_PROTO_AGAIN when &struct batadv_unicast was found, * FLOW_DISSECT_RET_OUT_GOOD when dissector should stop after encapsulation, * otherwise FLOW_DISSECT_RET_OUT_BAD */ static enum flow_dissect_ret __skb_flow_dissect_batadv(const struct sk_buff *skb, struct flow_dissector_key_control *key_control, const void *data, __be16 *p_proto, int *p_nhoff, int hlen, unsigned int flags) { struct { struct batadv_unicast_packet batadv_unicast; struct ethhdr eth; } *hdr, _hdr; hdr = __skb_header_pointer(skb, *p_nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) return FLOW_DISSECT_RET_OUT_BAD; if (hdr->batadv_unicast.version != BATADV_COMPAT_VERSION) return FLOW_DISSECT_RET_OUT_BAD; if (hdr->batadv_unicast.packet_type != BATADV_UNICAST) return FLOW_DISSECT_RET_OUT_BAD; *p_proto = hdr->eth.h_proto; *p_nhoff += sizeof(*hdr); key_control->flags |= FLOW_DIS_ENCAPSULATION; if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) return FLOW_DISSECT_RET_OUT_GOOD; return FLOW_DISSECT_RET_PROTO_AGAIN; } static void __skb_flow_dissect_tcp(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, int thoff, int hlen) { struct flow_dissector_key_tcp *key_tcp; struct tcphdr *th, _th; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_TCP)) return; th = __skb_header_pointer(skb, thoff, sizeof(_th), data, hlen, &_th); if (!th) return; if (unlikely(__tcp_hdrlen(th) < sizeof(_th))) return; key_tcp = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_TCP, target_container); key_tcp->flags = (*(__be16 *) &tcp_flag_word(th) & htons(0x0FFF)); } static void __skb_flow_dissect_ports(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, int nhoff, u8 ip_proto, int hlen) { enum flow_dissector_key_id dissector_ports = FLOW_DISSECTOR_KEY_MAX; struct flow_dissector_key_ports *key_ports; if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) dissector_ports = FLOW_DISSECTOR_KEY_PORTS; else if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS_RANGE)) dissector_ports = FLOW_DISSECTOR_KEY_PORTS_RANGE; if (dissector_ports == FLOW_DISSECTOR_KEY_MAX) return; key_ports = skb_flow_dissector_target(flow_dissector, dissector_ports, target_container); key_ports->ports = __skb_flow_get_ports(skb, nhoff, ip_proto, data, hlen); } static void __skb_flow_dissect_ipv4(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, const struct iphdr *iph) { struct flow_dissector_key_ip *key_ip; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IP)) return; key_ip = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_IP, target_container); key_ip->tos = iph->tos; key_ip->ttl = iph->ttl; } static void __skb_flow_dissect_ipv6(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, const struct ipv6hdr *iph) { struct flow_dissector_key_ip *key_ip; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IP)) return; key_ip = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_IP, target_container); key_ip->tos = ipv6_get_dsfield(iph); key_ip->ttl = iph->hop_limit; } /* Maximum number of protocol headers that can be parsed in * __skb_flow_dissect */ #define MAX_FLOW_DISSECT_HDRS 15 static bool skb_flow_dissect_allowed(int *num_hdrs) { ++*num_hdrs; return (*num_hdrs <= MAX_FLOW_DISSECT_HDRS); } static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys, struct flow_dissector *flow_dissector, void *target_container) { struct flow_dissector_key_ports *key_ports = NULL; struct flow_dissector_key_control *key_control; struct flow_dissector_key_basic *key_basic; struct flow_dissector_key_addrs *key_addrs; struct flow_dissector_key_tags *key_tags; key_control = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_CONTROL, target_container); key_control->thoff = flow_keys->thoff; if (flow_keys->is_frag) key_control->flags |= FLOW_DIS_IS_FRAGMENT; if (flow_keys->is_first_frag) key_control->flags |= FLOW_DIS_FIRST_FRAG; if (flow_keys->is_encap) key_control->flags |= FLOW_DIS_ENCAPSULATION; key_basic = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_BASIC, target_container); key_basic->n_proto = flow_keys->n_proto; key_basic->ip_proto = flow_keys->ip_proto; if (flow_keys->addr_proto == ETH_P_IP && dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPV4_ADDRS)) { key_addrs = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_IPV4_ADDRS, target_container); key_addrs->v4addrs.src = flow_keys->ipv4_src; key_addrs->v4addrs.dst = flow_keys->ipv4_dst; key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; } else if (flow_keys->addr_proto == ETH_P_IPV6 && dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPV6_ADDRS)) { key_addrs = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_IPV6_ADDRS, target_container); memcpy(&key_addrs->v6addrs.src, &flow_keys->ipv6_src, sizeof(key_addrs->v6addrs.src)); memcpy(&key_addrs->v6addrs.dst, &flow_keys->ipv6_dst, sizeof(key_addrs->v6addrs.dst)); key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) key_ports = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_PORTS, target_container); else if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS_RANGE)) key_ports = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_PORTS_RANGE, target_container); if (key_ports) { key_ports->src = flow_keys->sport; key_ports->dst = flow_keys->dport; } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_FLOW_LABEL)) { key_tags = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_FLOW_LABEL, target_container); key_tags->flow_label = ntohl(flow_keys->flow_label); } } u32 bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, __be16 proto, int nhoff, int hlen, unsigned int flags) { struct bpf_flow_keys *flow_keys = ctx->flow_keys; u32 result; /* Pass parameters to the BPF program */ memset(flow_keys, 0, sizeof(*flow_keys)); flow_keys->n_proto = proto; flow_keys->nhoff = nhoff; flow_keys->thoff = flow_keys->nhoff; BUILD_BUG_ON((int)BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG != (int)FLOW_DISSECTOR_F_PARSE_1ST_FRAG); BUILD_BUG_ON((int)BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL != (int)FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); BUILD_BUG_ON((int)BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP != (int)FLOW_DISSECTOR_F_STOP_AT_ENCAP); flow_keys->flags = flags; result = bpf_prog_run_pin_on_cpu(prog, ctx); flow_keys->nhoff = clamp_t(u16, flow_keys->nhoff, nhoff, hlen); flow_keys->thoff = clamp_t(u16, flow_keys->thoff, flow_keys->nhoff, hlen); return result; } static bool is_pppoe_ses_hdr_valid(const struct pppoe_hdr *hdr) { return hdr->ver == 1 && hdr->type == 1 && hdr->code == 0; } /** * __skb_flow_dissect - extract the flow_keys struct and return it * @net: associated network namespace, derived from @skb if NULL * @skb: sk_buff to extract the flow from, can be NULL if the rest are specified * @flow_dissector: list of keys to dissect * @target_container: target structure to put dissected values into * @data: raw buffer pointer to the packet, if NULL use skb->data * @proto: protocol for which to get the flow, if @data is NULL use skb->protocol * @nhoff: network header offset, if @data is NULL use skb_network_offset(skb) * @hlen: packet header length, if @data is NULL use skb_headlen(skb) * @flags: flags that control the dissection process, e.g. * FLOW_DISSECTOR_F_STOP_AT_ENCAP. * * The function will try to retrieve individual keys into target specified * by flow_dissector from either the skbuff or a raw buffer specified by the * rest parameters. * * Caller must take care of zeroing target container memory. */ bool __skb_flow_dissect(const struct net *net, const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, __be16 proto, int nhoff, int hlen, unsigned int flags) { struct flow_dissector_key_control *key_control; struct flow_dissector_key_basic *key_basic; struct flow_dissector_key_addrs *key_addrs; struct flow_dissector_key_tags *key_tags; struct flow_dissector_key_vlan *key_vlan; enum flow_dissect_ret fdret; enum flow_dissector_key_id dissector_vlan = FLOW_DISSECTOR_KEY_MAX; bool mpls_el = false; int mpls_lse = 0; int num_hdrs = 0; u8 ip_proto = 0; bool ret; if (!data) { data = skb->data; proto = skb_vlan_tag_present(skb) ? skb->vlan_proto : skb->protocol; nhoff = skb_network_offset(skb); hlen = skb_headlen(skb); #if IS_ENABLED(CONFIG_NET_DSA) if (unlikely(skb->dev && netdev_uses_dsa(skb->dev) && proto == htons(ETH_P_XDSA))) { const struct dsa_device_ops *ops; int offset = 0; ops = skb->dev->dsa_ptr->tag_ops; /* Only DSA header taggers break flow dissection */ if (ops->needed_headroom) { if (ops->flow_dissect) ops->flow_dissect(skb, &proto, &offset); else dsa_tag_generic_flow_dissect(skb, &proto, &offset); hlen -= offset; nhoff += offset; } } #endif } /* It is ensured by skb_flow_dissector_init() that control key will * be always present. */ key_control = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_CONTROL, target_container); /* It is ensured by skb_flow_dissector_init() that basic key will * be always present. */ key_basic = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_BASIC, target_container); if (skb) { if (!net) { if (skb->dev) net = dev_net(skb->dev); else if (skb->sk) net = sock_net(skb->sk); } } DEBUG_NET_WARN_ON_ONCE(!net); if (net) { enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR; struct bpf_prog_array *run_array; rcu_read_lock(); run_array = rcu_dereference(init_net.bpf.run_array[type]); if (!run_array) run_array = rcu_dereference(net->bpf.run_array[type]); if (run_array) { struct bpf_flow_keys flow_keys; struct bpf_flow_dissector ctx = { .flow_keys = &flow_keys, .data = data, .data_end = data + hlen, }; __be16 n_proto = proto; struct bpf_prog *prog; u32 result; if (skb) { ctx.skb = skb; /* we can't use 'proto' in the skb case * because it might be set to skb->vlan_proto * which has been pulled from the data */ n_proto = skb->protocol; } prog = READ_ONCE(run_array->items[0].prog); result = bpf_flow_dissect(prog, &ctx, n_proto, nhoff, hlen, flags); if (result == BPF_FLOW_DISSECTOR_CONTINUE) goto dissect_continue; __skb_flow_bpf_to_target(&flow_keys, flow_dissector, target_container); rcu_read_unlock(); return result == BPF_OK; } dissect_continue: rcu_read_unlock(); } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ETH_ADDRS)) { struct ethhdr *eth = eth_hdr(skb); struct flow_dissector_key_eth_addrs *key_eth_addrs; key_eth_addrs = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ETH_ADDRS, target_container); memcpy(key_eth_addrs, eth, sizeof(*key_eth_addrs)); } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_NUM_OF_VLANS)) { struct flow_dissector_key_num_of_vlans *key_num_of_vlans; key_num_of_vlans = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_NUM_OF_VLANS, target_container); key_num_of_vlans->num_of_vlans = 0; } proto_again: fdret = FLOW_DISSECT_RET_CONTINUE; switch (proto) { case htons(ETH_P_IP): { const struct iphdr *iph; struct iphdr _iph; iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph); if (!iph || iph->ihl < 5) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } nhoff += iph->ihl * 4; ip_proto = iph->protocol; if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPV4_ADDRS)) { key_addrs = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_IPV4_ADDRS, target_container); memcpy(&key_addrs->v4addrs.src, &iph->saddr, sizeof(key_addrs->v4addrs.src)); memcpy(&key_addrs->v4addrs.dst, &iph->daddr, sizeof(key_addrs->v4addrs.dst)); key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; } __skb_flow_dissect_ipv4(skb, flow_dissector, target_container, data, iph); if (ip_is_fragment(iph)) { key_control->flags |= FLOW_DIS_IS_FRAGMENT; if (iph->frag_off & htons(IP_OFFSET)) { fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } else { key_control->flags |= FLOW_DIS_FIRST_FRAG; if (!(flags & FLOW_DISSECTOR_F_PARSE_1ST_FRAG)) { fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } } } break; } case htons(ETH_P_IPV6): { const struct ipv6hdr *iph; struct ipv6hdr _iph; iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph); if (!iph) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } ip_proto = iph->nexthdr; nhoff += sizeof(struct ipv6hdr); if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPV6_ADDRS)) { key_addrs = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_IPV6_ADDRS, target_container); memcpy(&key_addrs->v6addrs.src, &iph->saddr, sizeof(key_addrs->v6addrs.src)); memcpy(&key_addrs->v6addrs.dst, &iph->daddr, sizeof(key_addrs->v6addrs.dst)); key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; } if ((dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_FLOW_LABEL) || (flags & FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL)) && ip6_flowlabel(iph)) { __be32 flow_label = ip6_flowlabel(iph); if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_FLOW_LABEL)) { key_tags = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_FLOW_LABEL, target_container); key_tags->flow_label = ntohl(flow_label); } if (flags & FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL) { fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } } __skb_flow_dissect_ipv6(skb, flow_dissector, target_container, data, iph); break; } case htons(ETH_P_8021AD): case htons(ETH_P_8021Q): { const struct vlan_hdr *vlan = NULL; struct vlan_hdr _vlan; __be16 saved_vlan_tpid = proto; if (dissector_vlan == FLOW_DISSECTOR_KEY_MAX && skb && skb_vlan_tag_present(skb)) { proto = skb->protocol; } else { vlan = __skb_header_pointer(skb, nhoff, sizeof(_vlan), data, hlen, &_vlan); if (!vlan) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } proto = vlan->h_vlan_encapsulated_proto; nhoff += sizeof(*vlan); } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_NUM_OF_VLANS) && !(key_control->flags & FLOW_DIS_ENCAPSULATION)) { struct flow_dissector_key_num_of_vlans *key_nvs; key_nvs = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_NUM_OF_VLANS, target_container); key_nvs->num_of_vlans++; } if (dissector_vlan == FLOW_DISSECTOR_KEY_MAX) { dissector_vlan = FLOW_DISSECTOR_KEY_VLAN; } else if (dissector_vlan == FLOW_DISSECTOR_KEY_VLAN) { dissector_vlan = FLOW_DISSECTOR_KEY_CVLAN; } else { fdret = FLOW_DISSECT_RET_PROTO_AGAIN; break; } if (dissector_uses_key(flow_dissector, dissector_vlan)) { key_vlan = skb_flow_dissector_target(flow_dissector, dissector_vlan, target_container); if (!vlan) { key_vlan->vlan_id = skb_vlan_tag_get_id(skb); key_vlan->vlan_priority = skb_vlan_tag_get_prio(skb); } else { key_vlan->vlan_id = ntohs(vlan->h_vlan_TCI) & VLAN_VID_MASK; key_vlan->vlan_priority = (ntohs(vlan->h_vlan_TCI) & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; } key_vlan->vlan_tpid = saved_vlan_tpid; key_vlan->vlan_eth_type = proto; } fdret = FLOW_DISSECT_RET_PROTO_AGAIN; break; } case htons(ETH_P_PPP_SES): { struct { struct pppoe_hdr hdr; __be16 proto; } *hdr, _hdr; u16 ppp_proto; hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } if (!is_pppoe_ses_hdr_valid(&hdr->hdr)) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } /* least significant bit of the most significant octet * indicates if protocol field was compressed */ ppp_proto = ntohs(hdr->proto); if (ppp_proto & 0x0100) { ppp_proto = ppp_proto >> 8; nhoff += PPPOE_SES_HLEN - 1; } else { nhoff += PPPOE_SES_HLEN; } if (ppp_proto == PPP_IP) { proto = htons(ETH_P_IP); fdret = FLOW_DISSECT_RET_PROTO_AGAIN; } else if (ppp_proto == PPP_IPV6) { proto = htons(ETH_P_IPV6); fdret = FLOW_DISSECT_RET_PROTO_AGAIN; } else if (ppp_proto == PPP_MPLS_UC) { proto = htons(ETH_P_MPLS_UC); fdret = FLOW_DISSECT_RET_PROTO_AGAIN; } else if (ppp_proto == PPP_MPLS_MC) { proto = htons(ETH_P_MPLS_MC); fdret = FLOW_DISSECT_RET_PROTO_AGAIN; } else if (ppp_proto_is_valid(ppp_proto)) { fdret = FLOW_DISSECT_RET_OUT_GOOD; } else { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PPPOE)) { struct flow_dissector_key_pppoe *key_pppoe; key_pppoe = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_PPPOE, target_container); key_pppoe->session_id = hdr->hdr.sid; key_pppoe->ppp_proto = htons(ppp_proto); key_pppoe->type = htons(ETH_P_PPP_SES); } break; } case htons(ETH_P_TIPC): { struct tipc_basic_hdr *hdr, _hdr; hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_TIPC)) { key_addrs = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_TIPC, target_container); key_addrs->tipckey.key = tipc_hdr_rps_key(hdr); key_control->addr_type = FLOW_DISSECTOR_KEY_TIPC; } fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } case htons(ETH_P_MPLS_UC): case htons(ETH_P_MPLS_MC): fdret = __skb_flow_dissect_mpls(skb, flow_dissector, target_container, data, nhoff, hlen, mpls_lse, &mpls_el); nhoff += sizeof(struct mpls_label); mpls_lse++; break; case htons(ETH_P_FCOE): if ((hlen - nhoff) < FCOE_HEADER_LEN) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } nhoff += FCOE_HEADER_LEN; fdret = FLOW_DISSECT_RET_OUT_GOOD; break; case htons(ETH_P_ARP): case htons(ETH_P_RARP): fdret = __skb_flow_dissect_arp(skb, flow_dissector, target_container, data, nhoff, hlen); break; case htons(ETH_P_BATMAN): fdret = __skb_flow_dissect_batadv(skb, key_control, data, &proto, &nhoff, hlen, flags); break; case htons(ETH_P_1588): { struct ptp_header *hdr, _hdr; hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } nhoff += sizeof(struct ptp_header); fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } case htons(ETH_P_PRP): case htons(ETH_P_HSR): { struct hsr_tag *hdr, _hdr; hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } proto = hdr->encap_proto; nhoff += HSR_HLEN; fdret = FLOW_DISSECT_RET_PROTO_AGAIN; break; } default: fdret = FLOW_DISSECT_RET_OUT_BAD; break; } /* Process result of proto processing */ switch (fdret) { case FLOW_DISSECT_RET_OUT_GOOD: goto out_good; case FLOW_DISSECT_RET_PROTO_AGAIN: if (skb_flow_dissect_allowed(&num_hdrs)) goto proto_again; goto out_good; case FLOW_DISSECT_RET_CONTINUE: case FLOW_DISSECT_RET_IPPROTO_AGAIN: break; case FLOW_DISSECT_RET_OUT_BAD: default: goto out_bad; } ip_proto_again: fdret = FLOW_DISSECT_RET_CONTINUE; switch (ip_proto) { case IPPROTO_GRE: if (flags & FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP) { fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } fdret = __skb_flow_dissect_gre(skb, key_control, flow_dissector, target_container, data, &proto, &nhoff, &hlen, flags); break; case NEXTHDR_HOP: case NEXTHDR_ROUTING: case NEXTHDR_DEST: { u8 _opthdr[2], *opthdr; if (proto != htons(ETH_P_IPV6)) break; opthdr = __skb_header_pointer(skb, nhoff, sizeof(_opthdr), data, hlen, &_opthdr); if (!opthdr) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } ip_proto = opthdr[0]; nhoff += (opthdr[1] + 1) << 3; fdret = FLOW_DISSECT_RET_IPPROTO_AGAIN; break; } case NEXTHDR_FRAGMENT: { struct frag_hdr _fh, *fh; if (proto != htons(ETH_P_IPV6)) break; fh = __skb_header_pointer(skb, nhoff, sizeof(_fh), data, hlen, &_fh); if (!fh) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } key_control->flags |= FLOW_DIS_IS_FRAGMENT; nhoff += sizeof(_fh); ip_proto = fh->nexthdr; if (!(fh->frag_off & htons(IP6_OFFSET))) { key_control->flags |= FLOW_DIS_FIRST_FRAG; if (flags & FLOW_DISSECTOR_F_PARSE_1ST_FRAG) { fdret = FLOW_DISSECT_RET_IPPROTO_AGAIN; break; } } fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } case IPPROTO_IPIP: if (flags & FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP) { fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } proto = htons(ETH_P_IP); key_control->flags |= FLOW_DIS_ENCAPSULATION; if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) { fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } fdret = FLOW_DISSECT_RET_PROTO_AGAIN; break; case IPPROTO_IPV6: if (flags & FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP) { fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } proto = htons(ETH_P_IPV6); key_control->flags |= FLOW_DIS_ENCAPSULATION; if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) { fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } fdret = FLOW_DISSECT_RET_PROTO_AGAIN; break; case IPPROTO_MPLS: proto = htons(ETH_P_MPLS_UC); fdret = FLOW_DISSECT_RET_PROTO_AGAIN; break; case IPPROTO_TCP: __skb_flow_dissect_tcp(skb, flow_dissector, target_container, data, nhoff, hlen); break; case IPPROTO_ICMP: case IPPROTO_ICMPV6: __skb_flow_dissect_icmp(skb, flow_dissector, target_container, data, nhoff, hlen); break; case IPPROTO_L2TP: __skb_flow_dissect_l2tpv3(skb, flow_dissector, target_container, data, nhoff, hlen); break; default: break; } if (!(key_control->flags & FLOW_DIS_IS_FRAGMENT)) __skb_flow_dissect_ports(skb, flow_dissector, target_container, data, nhoff, ip_proto, hlen); /* Process result of IP proto processing */ switch (fdret) { case FLOW_DISSECT_RET_PROTO_AGAIN: if (skb_flow_dissect_allowed(&num_hdrs)) goto proto_again; break; case FLOW_DISSECT_RET_IPPROTO_AGAIN: if (skb_flow_dissect_allowed(&num_hdrs)) goto ip_proto_again; break; case FLOW_DISSECT_RET_OUT_GOOD: case FLOW_DISSECT_RET_CONTINUE: break; case FLOW_DISSECT_RET_OUT_BAD: default: goto out_bad; } out_good: ret = true; out: key_control->thoff = min_t(u16, nhoff, skb ? skb->len : hlen); key_basic->n_proto = proto; key_basic->ip_proto = ip_proto; return ret; out_bad: ret = false; goto out; } EXPORT_SYMBOL(__skb_flow_dissect); static siphash_aligned_key_t hashrnd; static __always_inline void __flow_hash_secret_init(void) { net_get_random_once(&hashrnd, sizeof(hashrnd)); } static const void *flow_keys_hash_start(const struct flow_keys *flow) { BUILD_BUG_ON(FLOW_KEYS_HASH_OFFSET % SIPHASH_ALIGNMENT); return &flow->FLOW_KEYS_HASH_START_FIELD; } static inline size_t flow_keys_hash_length(const struct flow_keys *flow) { size_t diff = FLOW_KEYS_HASH_OFFSET + sizeof(flow->addrs); BUILD_BUG_ON((sizeof(*flow) - FLOW_KEYS_HASH_OFFSET) % sizeof(u32)); switch (flow->control.addr_type) { case FLOW_DISSECTOR_KEY_IPV4_ADDRS: diff -= sizeof(flow->addrs.v4addrs); break; case FLOW_DISSECTOR_KEY_IPV6_ADDRS: diff -= sizeof(flow->addrs.v6addrs); break; case FLOW_DISSECTOR_KEY_TIPC: diff -= sizeof(flow->addrs.tipckey); break; } return sizeof(*flow) - diff; } __be32 flow_get_u32_src(const struct flow_keys *flow) { switch (flow->control.addr_type) { case FLOW_DISSECTOR_KEY_IPV4_ADDRS: return flow->addrs.v4addrs.src; case FLOW_DISSECTOR_KEY_IPV6_ADDRS: return (__force __be32)ipv6_addr_hash( &flow->addrs.v6addrs.src); case FLOW_DISSECTOR_KEY_TIPC: return flow->addrs.tipckey.key; default: return 0; } } EXPORT_SYMBOL(flow_get_u32_src); __be32 flow_get_u32_dst(const struct flow_keys *flow) { switch (flow->control.addr_type) { case FLOW_DISSECTOR_KEY_IPV4_ADDRS: return flow->addrs.v4addrs.dst; case FLOW_DISSECTOR_KEY_IPV6_ADDRS: return (__force __be32)ipv6_addr_hash( &flow->addrs.v6addrs.dst); default: return 0; } } EXPORT_SYMBOL(flow_get_u32_dst); /* Sort the source and destination IP and the ports, * to have consistent hash within the two directions */ static inline void __flow_hash_consistentify(struct flow_keys *keys) { int addr_diff, i; switch (keys->control.addr_type) { case FLOW_DISSECTOR_KEY_IPV4_ADDRS: if ((__force u32)keys->addrs.v4addrs.dst < (__force u32)keys->addrs.v4addrs.src) swap(keys->addrs.v4addrs.src, keys->addrs.v4addrs.dst); if ((__force u16)keys->ports.dst < (__force u16)keys->ports.src) { swap(keys->ports.src, keys->ports.dst); } break; case FLOW_DISSECTOR_KEY_IPV6_ADDRS: addr_diff = memcmp(&keys->addrs.v6addrs.dst, &keys->addrs.v6addrs.src, sizeof(keys->addrs.v6addrs.dst)); if (addr_diff < 0) { for (i = 0; i < 4; i++) swap(keys->addrs.v6addrs.src.s6_addr32[i], keys->addrs.v6addrs.dst.s6_addr32[i]); } if ((__force u16)keys->ports.dst < (__force u16)keys->ports.src) { swap(keys->ports.src, keys->ports.dst); } break; } } static inline u32 __flow_hash_from_keys(struct flow_keys *keys, const siphash_key_t *keyval) { u32 hash; __flow_hash_consistentify(keys); hash = siphash(flow_keys_hash_start(keys), flow_keys_hash_length(keys), keyval); if (!hash) hash = 1; return hash; } u32 flow_hash_from_keys(struct flow_keys *keys) { __flow_hash_secret_init(); return __flow_hash_from_keys(keys, &hashrnd); } EXPORT_SYMBOL(flow_hash_from_keys); static inline u32 ___skb_get_hash(const struct sk_buff *skb, struct flow_keys *keys, const siphash_key_t *keyval) { skb_flow_dissect_flow_keys(skb, keys, FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); return __flow_hash_from_keys(keys, keyval); } struct _flow_keys_digest_data { __be16 n_proto; u8 ip_proto; u8 padding; __be32 ports; __be32 src; __be32 dst; }; void make_flow_keys_digest(struct flow_keys_digest *digest, const struct flow_keys *flow) { struct _flow_keys_digest_data *data = (struct _flow_keys_digest_data *)digest; BUILD_BUG_ON(sizeof(*data) > sizeof(*digest)); memset(digest, 0, sizeof(*digest)); data->n_proto = flow->basic.n_proto; data->ip_proto = flow->basic.ip_proto; data->ports = flow->ports.ports; data->src = flow->addrs.v4addrs.src; data->dst = flow->addrs.v4addrs.dst; } EXPORT_SYMBOL(make_flow_keys_digest); static struct flow_dissector flow_keys_dissector_symmetric __read_mostly; u32 __skb_get_hash_symmetric(const struct sk_buff *skb) { struct flow_keys keys; __flow_hash_secret_init(); memset(&keys, 0, sizeof(keys)); __skb_flow_dissect(NULL, skb, &flow_keys_dissector_symmetric, &keys, NULL, 0, 0, 0, 0); return __flow_hash_from_keys(&keys, &hashrnd); } EXPORT_SYMBOL_GPL(__skb_get_hash_symmetric); /** * __skb_get_hash: calculate a flow hash * @skb: sk_buff to calculate flow hash from * * This function calculates a flow hash based on src/dst addresses * and src/dst port numbers. Sets hash in skb to non-zero hash value * on success, zero indicates no valid hash. Also, sets l4_hash in skb * if hash is a canonical 4-tuple hash over transport ports. */ void __skb_get_hash(struct sk_buff *skb) { struct flow_keys keys; u32 hash; __flow_hash_secret_init(); hash = ___skb_get_hash(skb, &keys, &hashrnd); __skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys)); } EXPORT_SYMBOL(__skb_get_hash); __u32 skb_get_hash_perturb(const struct sk_buff *skb, const siphash_key_t *perturb) { struct flow_keys keys; return ___skb_get_hash(skb, &keys, perturb); } EXPORT_SYMBOL(skb_get_hash_perturb); u32 __skb_get_poff(const struct sk_buff *skb, const void *data, const struct flow_keys_basic *keys, int hlen) { u32 poff = keys->control.thoff; /* skip L4 headers for fragments after the first */ if ((keys->control.flags & FLOW_DIS_IS_FRAGMENT) && !(keys->control.flags & FLOW_DIS_FIRST_FRAG)) return poff; switch (keys->basic.ip_proto) { case IPPROTO_TCP: { /* access doff as u8 to avoid unaligned access */ const u8 *doff; u8 _doff; doff = __skb_header_pointer(skb, poff + 12, sizeof(_doff), data, hlen, &_doff); if (!doff) return poff; poff += max_t(u32, sizeof(struct tcphdr), (*doff & 0xF0) >> 2); break; } case IPPROTO_UDP: case IPPROTO_UDPLITE: poff += sizeof(struct udphdr); break; /* For the rest, we do not really care about header * extensions at this point for now. */ case IPPROTO_ICMP: poff += sizeof(struct icmphdr); break; case IPPROTO_ICMPV6: poff += sizeof(struct icmp6hdr); break; case IPPROTO_IGMP: poff += sizeof(struct igmphdr); break; case IPPROTO_DCCP: poff += sizeof(struct dccp_hdr); break; case IPPROTO_SCTP: poff += sizeof(struct sctphdr); break; } return poff; } /** * skb_get_poff - get the offset to the payload * @skb: sk_buff to get the payload offset from * * The function will get the offset to the payload as far as it could * be dissected. The main user is currently BPF, so that we can dynamically * truncate packets without needing to push actual payload to the user * space and can analyze headers only, instead. */ u32 skb_get_poff(const struct sk_buff *skb) { struct flow_keys_basic keys; if (!skb_flow_dissect_flow_keys_basic(NULL, skb, &keys, NULL, 0, 0, 0, 0)) return 0; return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb)); } __u32 __get_hash_from_flowi6(const struct flowi6 *fl6, struct flow_keys *keys) { memset(keys, 0, sizeof(*keys)); memcpy(&keys->addrs.v6addrs.src, &fl6->saddr, sizeof(keys->addrs.v6addrs.src)); memcpy(&keys->addrs.v6addrs.dst, &fl6->daddr, sizeof(keys->addrs.v6addrs.dst)); keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; keys->ports.src = fl6->fl6_sport; keys->ports.dst = fl6->fl6_dport; keys->keyid.keyid = fl6->fl6_gre_key; keys->tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); keys->basic.ip_proto = fl6->flowi6_proto; return flow_hash_from_keys(keys); } EXPORT_SYMBOL(__get_hash_from_flowi6); static const struct flow_dissector_key flow_keys_dissector_keys[] = { { .key_id = FLOW_DISSECTOR_KEY_CONTROL, .offset = offsetof(struct flow_keys, control), }, { .key_id = FLOW_DISSECTOR_KEY_BASIC, .offset = offsetof(struct flow_keys, basic), }, { .key_id = FLOW_DISSECTOR_KEY_IPV4_ADDRS, .offset = offsetof(struct flow_keys, addrs.v4addrs), }, { .key_id = FLOW_DISSECTOR_KEY_IPV6_ADDRS, .offset = offsetof(struct flow_keys, addrs.v6addrs), }, { .key_id = FLOW_DISSECTOR_KEY_TIPC, .offset = offsetof(struct flow_keys, addrs.tipckey), }, { .key_id = FLOW_DISSECTOR_KEY_PORTS, .offset = offsetof(struct flow_keys, ports), }, { .key_id = FLOW_DISSECTOR_KEY_VLAN, .offset = offsetof(struct flow_keys, vlan), }, { .key_id = FLOW_DISSECTOR_KEY_FLOW_LABEL, .offset = offsetof(struct flow_keys, tags), }, { .key_id = FLOW_DISSECTOR_KEY_GRE_KEYID, .offset = offsetof(struct flow_keys, keyid), }, }; static const struct flow_dissector_key flow_keys_dissector_symmetric_keys[] = { { .key_id = FLOW_DISSECTOR_KEY_CONTROL, .offset = offsetof(struct flow_keys, control), }, { .key_id = FLOW_DISSECTOR_KEY_BASIC, .offset = offsetof(struct flow_keys, basic), }, { .key_id = FLOW_DISSECTOR_KEY_IPV4_ADDRS, .offset = offsetof(struct flow_keys, addrs.v4addrs), }, { .key_id = FLOW_DISSECTOR_KEY_IPV6_ADDRS, .offset = offsetof(struct flow_keys, addrs.v6addrs), }, { .key_id = FLOW_DISSECTOR_KEY_PORTS, .offset = offsetof(struct flow_keys, ports), }, }; static const struct flow_dissector_key flow_keys_basic_dissector_keys[] = { { .key_id = FLOW_DISSECTOR_KEY_CONTROL, .offset = offsetof(struct flow_keys, control), }, { .key_id = FLOW_DISSECTOR_KEY_BASIC, .offset = offsetof(struct flow_keys, basic), }, }; struct flow_dissector flow_keys_dissector __read_mostly; EXPORT_SYMBOL(flow_keys_dissector); struct flow_dissector flow_keys_basic_dissector __read_mostly; EXPORT_SYMBOL(flow_keys_basic_dissector); static int __init init_default_flow_dissectors(void) { skb_flow_dissector_init(&flow_keys_dissector, flow_keys_dissector_keys, ARRAY_SIZE(flow_keys_dissector_keys)); skb_flow_dissector_init(&flow_keys_dissector_symmetric, flow_keys_dissector_symmetric_keys, ARRAY_SIZE(flow_keys_dissector_symmetric_keys)); skb_flow_dissector_init(&flow_keys_basic_dissector, flow_keys_basic_dissector_keys, ARRAY_SIZE(flow_keys_basic_dissector_keys)); return 0; } core_initcall(init_default_flow_dissectors); |
1 1 16 19 23 17 24 24 15 6 20 17 9 10 4 5 5 15 14 17 1 1 487 469 12 7 14 14 2 10 1 3 4 5 13 11 10 3 11 10 1 11 23 17 16 15 17 13 17 17 3 2 9 3 1 3 1 4 1 1 2 1 1 1 1 2 2 14 14 12 2 41 39 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 | /* * 8253/8254 interval timer emulation * * Copyright (c) 2003-2004 Fabrice Bellard * Copyright (c) 2006 Intel Corporation * Copyright (c) 2007 Keir Fraser, XenSource Inc * Copyright (c) 2008 Intel Corporation * Copyright 2009 Red Hat, Inc. and/or its affiliates. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. * * Authors: * Sheng Yang <sheng.yang@intel.com> * Based on QEMU and Xen. */ #define pr_fmt(fmt) "pit: " fmt #include <linux/kvm_host.h> #include <linux/slab.h> #include "ioapic.h" #include "irq.h" #include "i8254.h" #include "x86.h" #ifndef CONFIG_X86_64 #define mod_64(x, y) ((x) - (y) * div64_u64(x, y)) #else #define mod_64(x, y) ((x) % (y)) #endif #define RW_STATE_LSB 1 #define RW_STATE_MSB 2 #define RW_STATE_WORD0 3 #define RW_STATE_WORD1 4 static void pit_set_gate(struct kvm_pit *pit, int channel, u32 val) { struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel]; switch (c->mode) { default: case 0: case 4: /* XXX: just disable/enable counting */ break; case 1: case 2: case 3: case 5: /* Restart counting on rising edge. */ if (c->gate < val) c->count_load_time = ktime_get(); break; } c->gate = val; } static int pit_get_gate(struct kvm_pit *pit, int channel) { return pit->pit_state.channels[channel].gate; } static s64 __kpit_elapsed(struct kvm_pit *pit) { s64 elapsed; ktime_t remaining; struct kvm_kpit_state *ps = &pit->pit_state; if (!ps->period) return 0; /* * The Counter does not stop when it reaches zero. In * Modes 0, 1, 4, and 5 the Counter ``wraps around'' to * the highest count, either FFFF hex for binary counting * or 9999 for BCD counting, and continues counting. * Modes 2 and 3 are periodic; the Counter reloads * itself with the initial count and continues counting * from there. */ remaining = hrtimer_get_remaining(&ps->timer); elapsed = ps->period - ktime_to_ns(remaining); return elapsed; } static s64 kpit_elapsed(struct kvm_pit *pit, struct kvm_kpit_channel_state *c, int channel) { if (channel == 0) return __kpit_elapsed(pit); return ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time)); } static int pit_get_count(struct kvm_pit *pit, int channel) { struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel]; s64 d, t; int counter; t = kpit_elapsed(pit, c, channel); d = mul_u64_u32_div(t, KVM_PIT_FREQ, NSEC_PER_SEC); switch (c->mode) { case 0: case 1: case 4: case 5: counter = (c->count - d) & 0xffff; break; case 3: /* XXX: may be incorrect for odd counts */ counter = c->count - (mod_64((2 * d), c->count)); break; default: counter = c->count - mod_64(d, c->count); break; } return counter; } static int pit_get_out(struct kvm_pit *pit, int channel) { struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel]; s64 d, t; int out; t = kpit_elapsed(pit, c, channel); d = mul_u64_u32_div(t, KVM_PIT_FREQ, NSEC_PER_SEC); switch (c->mode) { default: case 0: out = (d >= c->count); break; case 1: out = (d < c->count); break; case 2: out = ((mod_64(d, c->count) == 0) && (d != 0)); break; case 3: out = (mod_64(d, c->count) < ((c->count + 1) >> 1)); break; case 4: case 5: out = (d == c->count); break; } return out; } static void pit_latch_count(struct kvm_pit *pit, int channel) { struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel]; if (!c->count_latched) { c->latched_count = pit_get_count(pit, channel); c->count_latched = c->rw_mode; } } static void pit_latch_status(struct kvm_pit *pit, int channel) { struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel]; if (!c->status_latched) { /* TODO: Return NULL COUNT (bit 6). */ c->status = ((pit_get_out(pit, channel) << 7) | (c->rw_mode << 4) | (c->mode << 1) | c->bcd); c->status_latched = 1; } } static inline struct kvm_pit *pit_state_to_pit(struct kvm_kpit_state *ps) { return container_of(ps, struct kvm_pit, pit_state); } static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) { struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state, irq_ack_notifier); struct kvm_pit *pit = pit_state_to_pit(ps); atomic_set(&ps->irq_ack, 1); /* irq_ack should be set before pending is read. Order accesses with * inc(pending) in pit_timer_fn and xchg(irq_ack, 0) in pit_do_work. */ smp_mb(); if (atomic_dec_if_positive(&ps->pending) > 0) kthread_queue_work(pit->worker, &pit->expired); } void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) { struct kvm_pit *pit = vcpu->kvm->arch.vpit; struct hrtimer *timer; /* Somewhat arbitrarily make vcpu0 the owner of the PIT. */ if (vcpu->vcpu_id || !pit) return; timer = &pit->pit_state.timer; mutex_lock(&pit->pit_state.lock); if (hrtimer_cancel(timer)) hrtimer_start_expires(timer, HRTIMER_MODE_ABS); mutex_unlock(&pit->pit_state.lock); } static void destroy_pit_timer(struct kvm_pit *pit) { hrtimer_cancel(&pit->pit_state.timer); kthread_flush_work(&pit->expired); } static void pit_do_work(struct kthread_work *work) { struct kvm_pit *pit = container_of(work, struct kvm_pit, expired); struct kvm *kvm = pit->kvm; struct kvm_vcpu *vcpu; unsigned long i; struct kvm_kpit_state *ps = &pit->pit_state; if (atomic_read(&ps->reinject) && !atomic_xchg(&ps->irq_ack, 0)) return; kvm_set_irq(kvm, pit->irq_source_id, 0, 1, false); kvm_set_irq(kvm, pit->irq_source_id, 0, 0, false); /* * Provides NMI watchdog support via Virtual Wire mode. * The route is: PIT -> LVT0 in NMI mode. * * Note: Our Virtual Wire implementation does not follow * the MP specification. We propagate a PIT interrupt to all * VCPUs and only when LVT0 is in NMI mode. The interrupt can * also be simultaneously delivered through PIC and IOAPIC. */ if (atomic_read(&kvm->arch.vapics_in_nmi_mode) > 0) kvm_for_each_vcpu(i, vcpu, kvm) kvm_apic_nmi_wd_deliver(vcpu); } static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) { struct kvm_kpit_state *ps = container_of(data, struct kvm_kpit_state, timer); struct kvm_pit *pt = pit_state_to_pit(ps); if (atomic_read(&ps->reinject)) atomic_inc(&ps->pending); kthread_queue_work(pt->worker, &pt->expired); if (ps->is_periodic) { hrtimer_add_expires_ns(&ps->timer, ps->period); return HRTIMER_RESTART; } else return HRTIMER_NORESTART; } static inline void kvm_pit_reset_reinject(struct kvm_pit *pit) { atomic_set(&pit->pit_state.pending, 0); atomic_set(&pit->pit_state.irq_ack, 1); } void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject) { struct kvm_kpit_state *ps = &pit->pit_state; struct kvm *kvm = pit->kvm; if (atomic_read(&ps->reinject) == reinject) return; /* * AMD SVM AVIC accelerates EOI write and does not trap. * This cause in-kernel PIT re-inject mode to fail * since it checks ps->irq_ack before kvm_set_irq() * and relies on the ack notifier to timely queue * the pt->worker work iterm and reinject the missed tick. * So, deactivate APICv when PIT is in reinject mode. */ if (reinject) { kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_PIT_REINJ); /* The initial state is preserved while ps->reinject == 0. */ kvm_pit_reset_reinject(pit); kvm_register_irq_ack_notifier(kvm, &ps->irq_ack_notifier); kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier); } else { kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_PIT_REINJ); kvm_unregister_irq_ack_notifier(kvm, &ps->irq_ack_notifier); kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier); } atomic_set(&ps->reinject, reinject); } static void create_pit_timer(struct kvm_pit *pit, u32 val, int is_period) { struct kvm_kpit_state *ps = &pit->pit_state; struct kvm *kvm = pit->kvm; s64 interval; if (!ioapic_in_kernel(kvm) || ps->flags & KVM_PIT_FLAGS_HPET_LEGACY) return; interval = mul_u64_u32_div(val, NSEC_PER_SEC, KVM_PIT_FREQ); pr_debug("create pit timer, interval is %llu nsec\n", interval); /* TODO The new value only affected after the retriggered */ hrtimer_cancel(&ps->timer); kthread_flush_work(&pit->expired); ps->period = interval; ps->is_periodic = is_period; kvm_pit_reset_reinject(pit); /* * Do not allow the guest to program periodic timers with small * interval, since the hrtimers are not throttled by the host * scheduler. */ if (ps->is_periodic) { s64 min_period = min_timer_period_us * 1000LL; if (ps->period < min_period) { pr_info_ratelimited( "kvm: requested %lld ns " "i8254 timer period limited to %lld ns\n", ps->period, min_period); ps->period = min_period; } } hrtimer_start(&ps->timer, ktime_add_ns(ktime_get(), interval), HRTIMER_MODE_ABS); } static void pit_load_count(struct kvm_pit *pit, int channel, u32 val) { struct kvm_kpit_state *ps = &pit->pit_state; pr_debug("load_count val is %u, channel is %d\n", val, channel); /* * The largest possible initial count is 0; this is equivalent * to 216 for binary counting and 104 for BCD counting. */ if (val == 0) val = 0x10000; ps->channels[channel].count = val; if (channel != 0) { ps->channels[channel].count_load_time = ktime_get(); return; } /* Two types of timer * mode 1 is one shot, mode 2 is period, otherwise del timer */ switch (ps->channels[0].mode) { case 0: case 1: /* FIXME: enhance mode 4 precision */ case 4: create_pit_timer(pit, val, 0); break; case 2: case 3: create_pit_timer(pit, val, 1); break; default: destroy_pit_timer(pit); } } void kvm_pit_load_count(struct kvm_pit *pit, int channel, u32 val, int hpet_legacy_start) { u8 saved_mode; WARN_ON_ONCE(!mutex_is_locked(&pit->pit_state.lock)); if (hpet_legacy_start) { /* save existing mode for later reenablement */ WARN_ON(channel != 0); saved_mode = pit->pit_state.channels[0].mode; pit->pit_state.channels[0].mode = 0xff; /* disable timer */ pit_load_count(pit, channel, val); pit->pit_state.channels[0].mode = saved_mode; } else { pit_load_count(pit, channel, val); } } static inline struct kvm_pit *dev_to_pit(struct kvm_io_device *dev) { return container_of(dev, struct kvm_pit, dev); } static inline struct kvm_pit *speaker_to_pit(struct kvm_io_device *dev) { return container_of(dev, struct kvm_pit, speaker_dev); } static inline int pit_in_range(gpa_t addr) { return ((addr >= KVM_PIT_BASE_ADDRESS) && (addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH)); } static int pit_ioport_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr, int len, const void *data) { struct kvm_pit *pit = dev_to_pit(this); struct kvm_kpit_state *pit_state = &pit->pit_state; int channel, access; struct kvm_kpit_channel_state *s; u32 val = *(u32 *) data; if (!pit_in_range(addr)) return -EOPNOTSUPP; val &= 0xff; addr &= KVM_PIT_CHANNEL_MASK; mutex_lock(&pit_state->lock); if (val != 0) pr_debug("write addr is 0x%x, len is %d, val is 0x%x\n", (unsigned int)addr, len, val); if (addr == 3) { channel = val >> 6; if (channel == 3) { /* Read-Back Command. */ for (channel = 0; channel < 3; channel++) { if (val & (2 << channel)) { if (!(val & 0x20)) pit_latch_count(pit, channel); if (!(val & 0x10)) pit_latch_status(pit, channel); } } } else { /* Select Counter <channel>. */ s = &pit_state->channels[channel]; access = (val >> 4) & KVM_PIT_CHANNEL_MASK; if (access == 0) { pit_latch_count(pit, channel); } else { s->rw_mode = access; s->read_state = access; s->write_state = access; s->mode = (val >> 1) & 7; if (s->mode > 5) s->mode -= 4; s->bcd = val & 1; } } } else { /* Write Count. */ s = &pit_state->channels[addr]; switch (s->write_state) { default: case RW_STATE_LSB: pit_load_count(pit, addr, val); break; case RW_STATE_MSB: pit_load_count(pit, addr, val << 8); break; case RW_STATE_WORD0: s->write_latch = val; s->write_state = RW_STATE_WORD1; break; case RW_STATE_WORD1: pit_load_count(pit, addr, s->write_latch | (val << 8)); s->write_state = RW_STATE_WORD0; break; } } mutex_unlock(&pit_state->lock); return 0; } static int pit_ioport_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr, int len, void *data) { struct kvm_pit *pit = dev_to_pit(this); struct kvm_kpit_state *pit_state = &pit->pit_state; int ret, count; struct kvm_kpit_channel_state *s; if (!pit_in_range(addr)) return -EOPNOTSUPP; addr &= KVM_PIT_CHANNEL_MASK; if (addr == 3) return 0; s = &pit_state->channels[addr]; mutex_lock(&pit_state->lock); if (s->status_latched) { s->status_latched = 0; ret = s->status; } else if (s->count_latched) { switch (s->count_latched) { default: case RW_STATE_LSB: ret = s->latched_count & 0xff; s->count_latched = 0; break; case RW_STATE_MSB: ret = s->latched_count >> 8; s->count_latched = 0; break; case RW_STATE_WORD0: ret = s->latched_count & 0xff; s->count_latched = RW_STATE_MSB; break; } } else { switch (s->read_state) { default: case RW_STATE_LSB: count = pit_get_count(pit, addr); ret = count & 0xff; break; case RW_STATE_MSB: count = pit_get_count(pit, addr); ret = (count >> 8) & 0xff; break; case RW_STATE_WORD0: count = pit_get_count(pit, addr); ret = count & 0xff; s->read_state = RW_STATE_WORD1; break; case RW_STATE_WORD1: count = pit_get_count(pit, addr); ret = (count >> 8) & 0xff; s->read_state = RW_STATE_WORD0; break; } } if (len > sizeof(ret)) len = sizeof(ret); memcpy(data, (char *)&ret, len); mutex_unlock(&pit_state->lock); return 0; } static int speaker_ioport_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr, int len, const void *data) { struct kvm_pit *pit = speaker_to_pit(this); struct kvm_kpit_state *pit_state = &pit->pit_state; u32 val = *(u32 *) data; if (addr != KVM_SPEAKER_BASE_ADDRESS) return -EOPNOTSUPP; mutex_lock(&pit_state->lock); if (val & (1 << 1)) pit_state->flags |= KVM_PIT_FLAGS_SPEAKER_DATA_ON; else pit_state->flags &= ~KVM_PIT_FLAGS_SPEAKER_DATA_ON; pit_set_gate(pit, 2, val & 1); mutex_unlock(&pit_state->lock); return 0; } static int speaker_ioport_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr, int len, void *data) { struct kvm_pit *pit = speaker_to_pit(this); struct kvm_kpit_state *pit_state = &pit->pit_state; unsigned int refresh_clock; int ret; if (addr != KVM_SPEAKER_BASE_ADDRESS) return -EOPNOTSUPP; /* Refresh clock toggles at about 15us. We approximate as 2^14ns. */ refresh_clock = ((unsigned int)ktime_to_ns(ktime_get()) >> 14) & 1; mutex_lock(&pit_state->lock); ret = (!!(pit_state->flags & KVM_PIT_FLAGS_SPEAKER_DATA_ON) << 1) | pit_get_gate(pit, 2) | (pit_get_out(pit, 2) << 5) | (refresh_clock << 4); if (len > sizeof(ret)) len = sizeof(ret); memcpy(data, (char *)&ret, len); mutex_unlock(&pit_state->lock); return 0; } static void kvm_pit_reset(struct kvm_pit *pit) { int i; struct kvm_kpit_channel_state *c; pit->pit_state.flags = 0; for (i = 0; i < 3; i++) { c = &pit->pit_state.channels[i]; c->mode = 0xff; c->gate = (i != 2); pit_load_count(pit, i, 0); } kvm_pit_reset_reinject(pit); } static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask) { struct kvm_pit *pit = container_of(kimn, struct kvm_pit, mask_notifier); if (!mask) kvm_pit_reset_reinject(pit); } static const struct kvm_io_device_ops pit_dev_ops = { .read = pit_ioport_read, .write = pit_ioport_write, }; static const struct kvm_io_device_ops speaker_dev_ops = { .read = speaker_ioport_read, .write = speaker_ioport_write, }; struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) { struct kvm_pit *pit; struct kvm_kpit_state *pit_state; struct pid *pid; pid_t pid_nr; int ret; pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL_ACCOUNT); if (!pit) return NULL; pit->irq_source_id = kvm_request_irq_source_id(kvm); if (pit->irq_source_id < 0) goto fail_request; mutex_init(&pit->pit_state.lock); pid = get_pid(task_tgid(current)); pid_nr = pid_vnr(pid); put_pid(pid); pit->worker = kthread_create_worker(0, "kvm-pit/%d", pid_nr); if (IS_ERR(pit->worker)) goto fail_kthread; kthread_init_work(&pit->expired, pit_do_work); pit->kvm = kvm; pit_state = &pit->pit_state; hrtimer_init(&pit_state->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); pit_state->timer.function = pit_timer_fn; pit_state->irq_ack_notifier.gsi = 0; pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq; pit->mask_notifier.func = pit_mask_notifer; kvm_pit_reset(pit); kvm_pit_set_reinject(pit, true); mutex_lock(&kvm->slots_lock); kvm_iodevice_init(&pit->dev, &pit_dev_ops); ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, KVM_PIT_BASE_ADDRESS, KVM_PIT_MEM_LENGTH, &pit->dev); if (ret < 0) goto fail_register_pit; if (flags & KVM_PIT_SPEAKER_DUMMY) { kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops); ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, KVM_SPEAKER_BASE_ADDRESS, 4, &pit->speaker_dev); if (ret < 0) goto fail_register_speaker; } mutex_unlock(&kvm->slots_lock); return pit; fail_register_speaker: kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev); fail_register_pit: mutex_unlock(&kvm->slots_lock); kvm_pit_set_reinject(pit, false); kthread_destroy_worker(pit->worker); fail_kthread: kvm_free_irq_source_id(kvm, pit->irq_source_id); fail_request: kfree(pit); return NULL; } void kvm_free_pit(struct kvm *kvm) { struct kvm_pit *pit = kvm->arch.vpit; if (pit) { mutex_lock(&kvm->slots_lock); kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev); kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->speaker_dev); mutex_unlock(&kvm->slots_lock); kvm_pit_set_reinject(pit, false); hrtimer_cancel(&pit->pit_state.timer); kthread_destroy_worker(pit->worker); kvm_free_irq_source_id(kvm, pit->irq_source_id); kfree(pit); } } |
61 48 47 580 63 24 8 1 5 3 1 1 4 4 4 4 4 4 1 5 3 1 1 5 1 4 4 2 2 1 7 2 3 3 5 2 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 | // SPDX-License-Identifier: GPL-2.0-only /* * kernel/power/main.c - PM subsystem core functionality. * * Copyright (c) 2003 Patrick Mochel * Copyright (c) 2003 Open Source Development Lab */ #include <linux/export.h> #include <linux/kobject.h> #include <linux/string.h> #include <linux/pm-trace.h> #include <linux/workqueue.h> #include <linux/debugfs.h> #include <linux/seq_file.h> #include <linux/suspend.h> #include <linux/syscalls.h> #include <linux/pm_runtime.h> #include <trace/events/power.h> #include <trace/hooks/power.h> #include "power.h" #ifdef CONFIG_PM_SLEEP unsigned int lock_system_sleep(void) { unsigned int flags = current->flags; current->flags |= PF_NOFREEZE; mutex_lock(&system_transition_mutex); return flags; } EXPORT_SYMBOL_GPL(lock_system_sleep); void unlock_system_sleep(unsigned int flags) { /* * Don't use freezer_count() because we don't want the call to * try_to_freeze() here. * * Reason: * Fundamentally, we just don't need it, because freezing condition * doesn't come into effect until we release the * system_transition_mutex lock, since the freezer always works with * system_transition_mutex held. * * More importantly, in the case of hibernation, * unlock_system_sleep() gets called in snapshot_read() and * snapshot_write() when the freezing condition is still in effect. * Which means, if we use try_to_freeze() here, it would make them * enter the refrigerator, thus causing hibernation to lockup. */ if (!(flags & PF_NOFREEZE)) current->flags &= ~PF_NOFREEZE; mutex_unlock(&system_transition_mutex); } EXPORT_SYMBOL_GPL(unlock_system_sleep); void ksys_sync_helper(void) { ktime_t start; long elapsed_msecs; start = ktime_get(); ksys_sync(); elapsed_msecs = ktime_to_ms(ktime_sub(ktime_get(), start)); pr_info("Filesystems sync: %ld.%03ld seconds\n", elapsed_msecs / MSEC_PER_SEC, elapsed_msecs % MSEC_PER_SEC); } EXPORT_SYMBOL_GPL(ksys_sync_helper); /* Routines for PM-transition notifications */ static BLOCKING_NOTIFIER_HEAD(pm_chain_head); int register_pm_notifier(struct notifier_block *nb) { return blocking_notifier_chain_register(&pm_chain_head, nb); } EXPORT_SYMBOL_GPL(register_pm_notifier); int unregister_pm_notifier(struct notifier_block *nb) { return blocking_notifier_chain_unregister(&pm_chain_head, nb); } EXPORT_SYMBOL_GPL(unregister_pm_notifier); int pm_notifier_call_chain_robust(unsigned long val_up, unsigned long val_down) { int ret; ret = blocking_notifier_call_chain_robust(&pm_chain_head, val_up, val_down, NULL); return notifier_to_errno(ret); } int pm_notifier_call_chain(unsigned long val) { return blocking_notifier_call_chain(&pm_chain_head, val, NULL); } /* If set, devices may be suspended and resumed asynchronously. */ int pm_async_enabled = 1; static ssize_t pm_async_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sprintf(buf, "%d\n", pm_async_enabled); } static ssize_t pm_async_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { unsigned long val; if (kstrtoul(buf, 10, &val)) return -EINVAL; if (val > 1) return -EINVAL; pm_async_enabled = val; return n; } power_attr(pm_async); #ifdef CONFIG_SUSPEND static ssize_t mem_sleep_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { char *s = buf; suspend_state_t i; for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) { if (i >= PM_SUSPEND_MEM && cxl_mem_active()) continue; if (mem_sleep_states[i]) { const char *label = mem_sleep_states[i]; if (mem_sleep_current == i) s += sprintf(s, "[%s] ", label); else s += sprintf(s, "%s ", label); } } /* Convert the last space to a newline if needed. */ if (s != buf) *(s-1) = '\n'; return (s - buf); } static suspend_state_t decode_suspend_state(const char *buf, size_t n) { suspend_state_t state; char *p; int len; p = memchr(buf, '\n', n); len = p ? p - buf : n; for (state = PM_SUSPEND_MIN; state < PM_SUSPEND_MAX; state++) { const char *label = mem_sleep_states[state]; if (label && len == strlen(label) && !strncmp(buf, label, len)) return state; } return PM_SUSPEND_ON; } static ssize_t mem_sleep_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { suspend_state_t state; int error; error = pm_autosleep_lock(); if (error) return error; if (pm_autosleep_state() > PM_SUSPEND_ON) { error = -EBUSY; goto out; } state = decode_suspend_state(buf, n); if (state < PM_SUSPEND_MAX && state > PM_SUSPEND_ON) mem_sleep_current = state; else error = -EINVAL; out: pm_autosleep_unlock(); return error ? error : n; } power_attr(mem_sleep); /* * sync_on_suspend: invoke ksys_sync_helper() before suspend. * * show() returns whether ksys_sync_helper() is invoked before suspend. * store() accepts 0 or 1. 0 disables ksys_sync_helper() and 1 enables it. */ bool sync_on_suspend_enabled = !IS_ENABLED(CONFIG_SUSPEND_SKIP_SYNC); static ssize_t sync_on_suspend_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sprintf(buf, "%d\n", sync_on_suspend_enabled); } static ssize_t sync_on_suspend_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { unsigned long val; if (kstrtoul(buf, 10, &val)) return -EINVAL; if (val > 1) return -EINVAL; sync_on_suspend_enabled = !!val; return n; } power_attr(sync_on_suspend); #endif /* CONFIG_SUSPEND */ #ifdef CONFIG_PM_SLEEP_DEBUG int pm_test_level = TEST_NONE; static const char * const pm_tests[__TEST_AFTER_LAST] = { [TEST_NONE] = "none", [TEST_CORE] = "core", [TEST_CPUS] = "processors", [TEST_PLATFORM] = "platform", [TEST_DEVICES] = "devices", [TEST_FREEZER] = "freezer", }; static ssize_t pm_test_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { char *s = buf; int level; for (level = TEST_FIRST; level <= TEST_MAX; level++) if (pm_tests[level]) { if (level == pm_test_level) s += sprintf(s, "[%s] ", pm_tests[level]); else s += sprintf(s, "%s ", pm_tests[level]); } if (s != buf) /* convert the last space to a newline */ *(s-1) = '\n'; return (s - buf); } static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { unsigned int sleep_flags; const char * const *s; int error = -EINVAL; int level; char *p; int len; p = memchr(buf, '\n', n); len = p ? p - buf : n; sleep_flags = lock_system_sleep(); level = TEST_FIRST; for (s = &pm_tests[level]; level <= TEST_MAX; s++, level++) if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) { pm_test_level = level; error = 0; break; } unlock_system_sleep(sleep_flags); return error ? error : n; } power_attr(pm_test); #endif /* CONFIG_PM_SLEEP_DEBUG */ static char *suspend_step_name(enum suspend_stat_step step) { switch (step) { case SUSPEND_FREEZE: return "freeze"; case SUSPEND_PREPARE: return "prepare"; case SUSPEND_SUSPEND: return "suspend"; case SUSPEND_SUSPEND_NOIRQ: return "suspend_noirq"; case SUSPEND_RESUME_NOIRQ: return "resume_noirq"; case SUSPEND_RESUME: return "resume"; default: return ""; } } #define suspend_attr(_name) \ static ssize_t _name##_show(struct kobject *kobj, \ struct kobj_attribute *attr, char *buf) \ { \ return sprintf(buf, "%d\n", suspend_stats._name); \ } \ static struct kobj_attribute _name = __ATTR_RO(_name) suspend_attr(success); suspend_attr(fail); suspend_attr(failed_freeze); suspend_attr(failed_prepare); suspend_attr(failed_suspend); suspend_attr(failed_suspend_late); suspend_attr(failed_suspend_noirq); suspend_attr(failed_resume); suspend_attr(failed_resume_early); suspend_attr(failed_resume_noirq); static ssize_t last_failed_dev_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { int index; char *last_failed_dev = NULL; index = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1; index %= REC_FAILED_NUM; last_failed_dev = suspend_stats.failed_devs[index]; return sprintf(buf, "%s\n", last_failed_dev); } static struct kobj_attribute last_failed_dev = __ATTR_RO(last_failed_dev); static ssize_t last_failed_errno_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { int index; int last_failed_errno; index = suspend_stats.last_failed_errno + REC_FAILED_NUM - 1; index %= REC_FAILED_NUM; last_failed_errno = suspend_stats.errno[index]; return sprintf(buf, "%d\n", last_failed_errno); } static struct kobj_attribute last_failed_errno = __ATTR_RO(last_failed_errno); static ssize_t last_failed_step_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { int index; enum suspend_stat_step step; char *last_failed_step = NULL; index = suspend_stats.last_failed_step + REC_FAILED_NUM - 1; index %= REC_FAILED_NUM; step = suspend_stats.failed_steps[index]; last_failed_step = suspend_step_name(step); return sprintf(buf, "%s\n", last_failed_step); } static struct kobj_attribute last_failed_step = __ATTR_RO(last_failed_step); static struct attribute *suspend_attrs[] = { &success.attr, &fail.attr, &failed_freeze.attr, &failed_prepare.attr, &failed_suspend.attr, &failed_suspend_late.attr, &failed_suspend_noirq.attr, &failed_resume.attr, &failed_resume_early.attr, &failed_resume_noirq.attr, &last_failed_dev.attr, &last_failed_errno.attr, &last_failed_step.attr, NULL, }; static const struct attribute_group suspend_attr_group = { .name = "suspend_stats", .attrs = suspend_attrs, }; #ifdef CONFIG_DEBUG_FS static int suspend_stats_show(struct seq_file *s, void *unused) { int i, index, last_dev, last_errno, last_step; last_dev = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1; last_dev %= REC_FAILED_NUM; last_errno = suspend_stats.last_failed_errno + REC_FAILED_NUM - 1; last_errno %= REC_FAILED_NUM; last_step = suspend_stats.last_failed_step + REC_FAILED_NUM - 1; last_step %= REC_FAILED_NUM; seq_printf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n" "%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n", "success", suspend_stats.success, "fail", suspend_stats.fail, "failed_freeze", suspend_stats.failed_freeze, "failed_prepare", suspend_stats.failed_prepare, "failed_suspend", suspend_stats.failed_suspend, "failed_suspend_late", suspend_stats.failed_suspend_late, "failed_suspend_noirq", suspend_stats.failed_suspend_noirq, "failed_resume", suspend_stats.failed_resume, "failed_resume_early", suspend_stats.failed_resume_early, "failed_resume_noirq", suspend_stats.failed_resume_noirq); seq_printf(s, "failures:\n last_failed_dev:\t%-s\n", suspend_stats.failed_devs[last_dev]); for (i = 1; i < REC_FAILED_NUM; i++) { index = last_dev + REC_FAILED_NUM - i; index %= REC_FAILED_NUM; seq_printf(s, "\t\t\t%-s\n", suspend_stats.failed_devs[index]); } seq_printf(s, " last_failed_errno:\t%-d\n", suspend_stats.errno[last_errno]); for (i = 1; i < REC_FAILED_NUM; i++) { index = last_errno + REC_FAILED_NUM - i; index %= REC_FAILED_NUM; seq_printf(s, "\t\t\t%-d\n", suspend_stats.errno[index]); } seq_printf(s, " last_failed_step:\t%-s\n", suspend_step_name( suspend_stats.failed_steps[last_step])); for (i = 1; i < REC_FAILED_NUM; i++) { index = last_step + REC_FAILED_NUM - i; index %= REC_FAILED_NUM; seq_printf(s, "\t\t\t%-s\n", suspend_step_name( suspend_stats.failed_steps[index])); } return 0; } DEFINE_SHOW_ATTRIBUTE(suspend_stats); static int __init pm_debugfs_init(void) { debugfs_create_file("suspend_stats", S_IFREG | S_IRUGO, NULL, NULL, &suspend_stats_fops); return 0; } late_initcall(pm_debugfs_init); #endif /* CONFIG_DEBUG_FS */ #endif /* CONFIG_PM_SLEEP */ #ifdef CONFIG_PM_SLEEP_DEBUG /* * pm_print_times: print time taken by devices to suspend and resume. * * show() returns whether printing of suspend and resume times is enabled. * store() accepts 0 or 1. 0 disables printing and 1 enables it. */ bool pm_print_times_enabled; static ssize_t pm_print_times_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sprintf(buf, "%d\n", pm_print_times_enabled); } static ssize_t pm_print_times_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { unsigned long val; if (kstrtoul(buf, 10, &val)) return -EINVAL; if (val > 1) return -EINVAL; pm_print_times_enabled = !!val; return n; } power_attr(pm_print_times); static inline void pm_print_times_init(void) { pm_print_times_enabled = !!initcall_debug; } static ssize_t pm_wakeup_irq_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { if (!pm_wakeup_irq()) return -ENODATA; return sprintf(buf, "%u\n", pm_wakeup_irq()); } power_attr_ro(pm_wakeup_irq); bool pm_debug_messages_on __read_mostly; static ssize_t pm_debug_messages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sprintf(buf, "%d\n", pm_debug_messages_on); } static ssize_t pm_debug_messages_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { unsigned long val; if (kstrtoul(buf, 10, &val)) return -EINVAL; if (val > 1) return -EINVAL; pm_debug_messages_on = !!val; return n; } power_attr(pm_debug_messages); static int __init pm_debug_messages_setup(char *str) { pm_debug_messages_on = true; return 1; } __setup("pm_debug_messages", pm_debug_messages_setup); #else /* !CONFIG_PM_SLEEP_DEBUG */ static inline void pm_print_times_init(void) {} #endif /* CONFIG_PM_SLEEP_DEBUG */ struct kobject *power_kobj; /* * state - control system sleep states. * * show() returns available sleep state labels, which may be "mem", "standby", * "freeze" and "disk" (hibernation). * See Documentation/admin-guide/pm/sleep-states.rst for a description of * what they mean. * * store() accepts one of those strings, translates it into the proper * enumerated value, and initiates a suspend transition. */ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { char *s = buf; #ifdef CONFIG_SUSPEND suspend_state_t i; for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) if (pm_states[i]) s += sprintf(s,"%s ", pm_states[i]); #endif if (hibernation_available()) s += sprintf(s, "disk "); if (s != buf) /* convert the last space to a newline */ *(s-1) = '\n'; return (s - buf); } static suspend_state_t decode_state(const char *buf, size_t n) { #ifdef CONFIG_SUSPEND suspend_state_t state; #endif char *p; int len; p = memchr(buf, '\n', n); len = p ? p - buf : n; /* Check hibernation first. */ if (len == 4 && str_has_prefix(buf, "disk")) return PM_SUSPEND_MAX; #ifdef CONFIG_SUSPEND for (state = PM_SUSPEND_MIN; state < PM_SUSPEND_MAX; state++) { const char *label = pm_states[state]; if (label && len == strlen(label) && !strncmp(buf, label, len)) return state; } #endif return PM_SUSPEND_ON; } static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { suspend_state_t state; int error; error = pm_autosleep_lock(); if (error) return error; if (pm_autosleep_state() > PM_SUSPEND_ON) { error = -EBUSY; goto out; } state = decode_state(buf, n); if (state < PM_SUSPEND_MAX) { if (state == PM_SUSPEND_MEM) state = mem_sleep_current; error = pm_suspend(state); } else if (state == PM_SUSPEND_MAX) { error = hibernate(); trace_android_vh_hibernate_state(error); } else { error = -EINVAL; } out: pm_autosleep_unlock(); return error ? error : n; } power_attr(state); #ifdef CONFIG_PM_SLEEP /* * The 'wakeup_count' attribute, along with the functions defined in * drivers/base/power/wakeup.c, provides a means by which wakeup events can be * handled in a non-racy way. * * If a wakeup event occurs when the system is in a sleep state, it simply is * woken up. In turn, if an event that would wake the system up from a sleep * state occurs when it is undergoing a transition to that sleep state, the * transition should be aborted. Moreover, if such an event occurs when the * system is in the working state, an attempt to start a transition to the * given sleep state should fail during certain period after the detection of * the event. Using the 'state' attribute alone is not sufficient to satisfy * these requirements, because a wakeup event may occur exactly when 'state' * is being written to and may be delivered to user space right before it is * frozen, so the event will remain only partially processed until the system is * woken up by another event. In particular, it won't cause the transition to * a sleep state to be aborted. * * This difficulty may be overcome if user space uses 'wakeup_count' before * writing to 'state'. It first should read from 'wakeup_count' and store * the read value. Then, after carrying out its own preparations for the system * transition to a sleep state, it should write the stored value to * 'wakeup_count'. If that fails, at least one wakeup event has occurred since * 'wakeup_count' was read and 'state' should not be written to. Otherwise, it * is allowed to write to 'state', but the transition will be aborted if there * are any wakeup events detected after 'wakeup_count' was written to. */ static ssize_t wakeup_count_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { unsigned int val; return pm_get_wakeup_count(&val, true) ? sprintf(buf, "%u\n", val) : -EINTR; } static ssize_t wakeup_count_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { unsigned int val; int error; error = pm_autosleep_lock(); if (error) return error; if (pm_autosleep_state() > PM_SUSPEND_ON) { error = -EBUSY; goto out; } error = -EINVAL; if (sscanf(buf, "%u", &val) == 1) { if (pm_save_wakeup_count(val)) error = n; else pm_print_active_wakeup_sources(); } out: pm_autosleep_unlock(); return error; } power_attr(wakeup_count); #ifdef CONFIG_PM_AUTOSLEEP static ssize_t autosleep_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { suspend_state_t state = pm_autosleep_state(); if (state == PM_SUSPEND_ON) return sprintf(buf, "off\n"); #ifdef CONFIG_SUSPEND if (state < PM_SUSPEND_MAX) return sprintf(buf, "%s\n", pm_states[state] ? pm_states[state] : "error"); #endif #ifdef CONFIG_HIBERNATION return sprintf(buf, "disk\n"); #else return sprintf(buf, "error"); #endif } static ssize_t autosleep_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { suspend_state_t state = decode_state(buf, n); int error; if (state == PM_SUSPEND_ON && strcmp(buf, "off") && strcmp(buf, "off\n")) return -EINVAL; if (state == PM_SUSPEND_MEM) state = mem_sleep_current; error = pm_autosleep_set_state(state); return error ? error : n; } power_attr(autosleep); #endif /* CONFIG_PM_AUTOSLEEP */ #ifdef CONFIG_PM_WAKELOCKS static ssize_t wake_lock_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return pm_show_wakelocks(buf, true); } static ssize_t wake_lock_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { int error = pm_wake_lock(buf); return error ? error : n; } power_attr(wake_lock); static ssize_t wake_unlock_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return pm_show_wakelocks(buf, false); } static ssize_t wake_unlock_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { int error = pm_wake_unlock(buf); return error ? error : n; } power_attr(wake_unlock); #endif /* CONFIG_PM_WAKELOCKS */ #endif /* CONFIG_PM_SLEEP */ #ifdef CONFIG_PM_TRACE int pm_trace_enabled; static ssize_t pm_trace_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sprintf(buf, "%d\n", pm_trace_enabled); } static ssize_t pm_trace_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { int val; if (sscanf(buf, "%d", &val) == 1) { pm_trace_enabled = !!val; if (pm_trace_enabled) { pr_warn("PM: Enabling pm_trace changes system date and time during resume.\n" "PM: Correct system time has to be restored manually after resume.\n"); } return n; } return -EINVAL; } power_attr(pm_trace); static ssize_t pm_trace_dev_match_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return show_trace_dev_match(buf, PAGE_SIZE); } power_attr_ro(pm_trace_dev_match); #endif /* CONFIG_PM_TRACE */ #ifdef CONFIG_FREEZER static ssize_t pm_freeze_timeout_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sprintf(buf, "%u\n", freeze_timeout_msecs); } static ssize_t pm_freeze_timeout_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { unsigned long val; if (kstrtoul(buf, 10, &val)) return -EINVAL; freeze_timeout_msecs = val; return n; } power_attr(pm_freeze_timeout); #endif /* CONFIG_FREEZER*/ static struct attribute * g[] = { &state_attr.attr, #ifdef CONFIG_PM_TRACE &pm_trace_attr.attr, &pm_trace_dev_match_attr.attr, #endif #ifdef CONFIG_PM_SLEEP &pm_async_attr.attr, &wakeup_count_attr.attr, #ifdef CONFIG_SUSPEND &mem_sleep_attr.attr, &sync_on_suspend_attr.attr, #endif #ifdef CONFIG_PM_AUTOSLEEP &autosleep_attr.attr, #endif #ifdef CONFIG_PM_WAKELOCKS &wake_lock_attr.attr, &wake_unlock_attr.attr, #endif #ifdef CONFIG_PM_SLEEP_DEBUG &pm_test_attr.attr, &pm_print_times_attr.attr, &pm_wakeup_irq_attr.attr, &pm_debug_messages_attr.attr, #endif #endif #ifdef CONFIG_FREEZER &pm_freeze_timeout_attr.attr, #endif NULL, }; static const struct attribute_group attr_group = { .attrs = g, }; static const struct attribute_group *attr_groups[] = { &attr_group, #ifdef CONFIG_PM_SLEEP &suspend_attr_group, #endif NULL, }; struct workqueue_struct *pm_wq; EXPORT_SYMBOL_GPL(pm_wq); static int __init pm_start_workqueue(void) { pm_wq = alloc_workqueue("pm", WQ_FREEZABLE, 0); return pm_wq ? 0 : -ENOMEM; } static int __init pm_init(void) { int error = pm_start_workqueue(); if (error) return error; hibernate_image_size_init(); hibernate_reserved_size_init(); pm_states_init(); power_kobj = kobject_create_and_add("power", NULL); if (!power_kobj) return -ENOMEM; error = sysfs_create_groups(power_kobj, attr_groups); if (error) return error; pm_print_times_init(); return pm_autosleep_init(); } core_initcall(pm_init); |
268 81 289 245 180 193 130 1491 1036 663 1071 803 311 916 68 827 130 1756 173 173 232 232 6 1 1 2154 45 45 87 87 2231 123 63 6807 63 6794 6804 6791 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 | // SPDX-License-Identifier: GPL-2.0-only /* * common LSM auditing functions * * Based on code written for SELinux by : * Stephen Smalley, <sds@tycho.nsa.gov> * James Morris <jmorris@redhat.com> * Author : Etienne Basset, <etienne.basset@ensta.org> */ #include <linux/types.h> #include <linux/stddef.h> #include <linux/kernel.h> #include <linux/gfp.h> #include <linux/fs.h> #include <linux/init.h> #include <net/sock.h> #include <linux/un.h> #include <net/af_unix.h> #include <linux/audit.h> #include <linux/ipv6.h> #include <linux/ip.h> #include <net/ip.h> #include <net/ipv6.h> #include <linux/tcp.h> #include <linux/udp.h> #include <linux/dccp.h> #include <linux/sctp.h> #include <linux/lsm_audit.h> #include <linux/security.h> /** * ipv4_skb_to_auditdata : fill auditdata from skb * @skb : the skb * @ad : the audit data to fill * @proto : the layer 4 protocol * * return 0 on success */ int ipv4_skb_to_auditdata(struct sk_buff *skb, struct common_audit_data *ad, u8 *proto) { int ret = 0; struct iphdr *ih; ih = ip_hdr(skb); ad->u.net->v4info.saddr = ih->saddr; ad->u.net->v4info.daddr = ih->daddr; if (proto) *proto = ih->protocol; /* non initial fragment */ if (ntohs(ih->frag_off) & IP_OFFSET) return 0; switch (ih->protocol) { case IPPROTO_TCP: { struct tcphdr *th = tcp_hdr(skb); ad->u.net->sport = th->source; ad->u.net->dport = th->dest; break; } case IPPROTO_UDP: { struct udphdr *uh = udp_hdr(skb); ad->u.net->sport = uh->source; ad->u.net->dport = uh->dest; break; } case IPPROTO_DCCP: { struct dccp_hdr *dh = dccp_hdr(skb); ad->u.net->sport = dh->dccph_sport; ad->u.net->dport = dh->dccph_dport; break; } case IPPROTO_SCTP: { struct sctphdr *sh = sctp_hdr(skb); ad->u.net->sport = sh->source; ad->u.net->dport = sh->dest; break; } default: ret = -EINVAL; } return ret; } #if IS_ENABLED(CONFIG_IPV6) /** * ipv6_skb_to_auditdata : fill auditdata from skb * @skb : the skb * @ad : the audit data to fill * @proto : the layer 4 protocol * * return 0 on success */ int ipv6_skb_to_auditdata(struct sk_buff *skb, struct common_audit_data *ad, u8 *proto) { int offset, ret = 0; struct ipv6hdr *ip6; u8 nexthdr; __be16 frag_off; ip6 = ipv6_hdr(skb); ad->u.net->v6info.saddr = ip6->saddr; ad->u.net->v6info.daddr = ip6->daddr; /* IPv6 can have several extension header before the Transport header * skip them */ offset = skb_network_offset(skb); offset += sizeof(*ip6); nexthdr = ip6->nexthdr; offset = ipv6_skip_exthdr(skb, offset, &nexthdr, &frag_off); if (offset < 0) return 0; if (proto) *proto = nexthdr; switch (nexthdr) { case IPPROTO_TCP: { struct tcphdr _tcph, *th; th = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph); if (th == NULL) break; ad->u.net->sport = th->source; ad->u.net->dport = th->dest; break; } case IPPROTO_UDP: { struct udphdr _udph, *uh; uh = skb_header_pointer(skb, offset, sizeof(_udph), &_udph); if (uh == NULL) break; ad->u.net->sport = uh->source; ad->u.net->dport = uh->dest; break; } case IPPROTO_DCCP: { struct dccp_hdr _dccph, *dh; dh = skb_header_pointer(skb, offset, sizeof(_dccph), &_dccph); if (dh == NULL) break; ad->u.net->sport = dh->dccph_sport; ad->u.net->dport = dh->dccph_dport; break; } case IPPROTO_SCTP: { struct sctphdr _sctph, *sh; sh = skb_header_pointer(skb, offset, sizeof(_sctph), &_sctph); if (sh == NULL) break; ad->u.net->sport = sh->source; ad->u.net->dport = sh->dest; break; } default: ret = -EINVAL; } return ret; } #endif static inline void print_ipv6_addr(struct audit_buffer *ab, const struct in6_addr *addr, __be16 port, char *name1, char *name2) { if (!ipv6_addr_any(addr)) audit_log_format(ab, " %s=%pI6c", name1, addr); if (port) audit_log_format(ab, " %s=%d", name2, ntohs(port)); } static inline void print_ipv4_addr(struct audit_buffer *ab, __be32 addr, __be16 port, char *name1, char *name2) { if (addr) audit_log_format(ab, " %s=%pI4", name1, &addr); if (port) audit_log_format(ab, " %s=%d", name2, ntohs(port)); } /** * dump_common_audit_data - helper to dump common audit data * @a : common audit data * */ static void dump_common_audit_data(struct audit_buffer *ab, struct common_audit_data *a) { char comm[sizeof(current->comm)]; /* * To keep stack sizes in check force programers to notice if they * start making this union too large! See struct lsm_network_audit * as an example of how to deal with large data. */ BUILD_BUG_ON(sizeof(a->u) > sizeof(void *)*2); audit_log_format(ab, " pid=%d comm=", task_tgid_nr(current)); audit_log_untrustedstring(ab, memcpy(comm, current->comm, sizeof(comm))); switch (a->type) { case LSM_AUDIT_DATA_NONE: return; case LSM_AUDIT_DATA_IPC: audit_log_format(ab, " ipc_key=%d ", a->u.ipc_id); break; case LSM_AUDIT_DATA_CAP: audit_log_format(ab, " capability=%d ", a->u.cap); break; case LSM_AUDIT_DATA_PATH: { struct inode *inode; audit_log_d_path(ab, " path=", &a->u.path); inode = d_backing_inode(a->u.path.dentry); if (inode) { audit_log_format(ab, " dev="); audit_log_untrustedstring(ab, inode->i_sb->s_id); audit_log_format(ab, " ino=%lu", inode->i_ino); } break; } case LSM_AUDIT_DATA_FILE: { struct inode *inode; audit_log_d_path(ab, " path=", &a->u.file->f_path); inode = file_inode(a->u.file); if (inode) { audit_log_format(ab, " dev="); audit_log_untrustedstring(ab, inode->i_sb->s_id); audit_log_format(ab, " ino=%lu", inode->i_ino); } break; } case LSM_AUDIT_DATA_IOCTL_OP: { struct inode *inode; audit_log_d_path(ab, " path=", &a->u.op->path); inode = a->u.op->path.dentry->d_inode; if (inode) { audit_log_format(ab, " dev="); audit_log_untrustedstring(ab, inode->i_sb->s_id); audit_log_format(ab, " ino=%lu", inode->i_ino); } audit_log_format(ab, " ioctlcmd=0x%hx", a->u.op->cmd); break; } case LSM_AUDIT_DATA_DENTRY: { struct inode *inode; audit_log_format(ab, " name="); spin_lock(&a->u.dentry->d_lock); audit_log_untrustedstring(ab, a->u.dentry->d_name.name); spin_unlock(&a->u.dentry->d_lock); inode = d_backing_inode(a->u.dentry); if (inode) { audit_log_format(ab, " dev="); audit_log_untrustedstring(ab, inode->i_sb->s_id); audit_log_format(ab, " ino=%lu", inode->i_ino); } break; } case LSM_AUDIT_DATA_INODE: { struct dentry *dentry; struct inode *inode; rcu_read_lock(); inode = a->u.inode; dentry = d_find_alias_rcu(inode); if (dentry) { audit_log_format(ab, " name="); spin_lock(&dentry->d_lock); audit_log_untrustedstring(ab, dentry->d_name.name); spin_unlock(&dentry->d_lock); } audit_log_format(ab, " dev="); audit_log_untrustedstring(ab, inode->i_sb->s_id); audit_log_format(ab, " ino=%lu", inode->i_ino); rcu_read_unlock(); break; } case LSM_AUDIT_DATA_TASK: { struct task_struct *tsk = a->u.tsk; if (tsk) { pid_t pid = task_tgid_nr(tsk); if (pid) { char comm[sizeof(tsk->comm)]; audit_log_format(ab, " opid=%d ocomm=", pid); audit_log_untrustedstring(ab, memcpy(comm, tsk->comm, sizeof(comm))); } } break; } case LSM_AUDIT_DATA_NET: if (a->u.net->sk) { const struct sock *sk = a->u.net->sk; struct unix_sock *u; struct unix_address *addr; int len = 0; char *p = NULL; switch (sk->sk_family) { case AF_INET: { struct inet_sock *inet = inet_sk(sk); print_ipv4_addr(ab, inet->inet_rcv_saddr, inet->inet_sport, "laddr", "lport"); print_ipv4_addr(ab, inet->inet_daddr, inet->inet_dport, "faddr", "fport"); break; } #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: { struct inet_sock *inet = inet_sk(sk); print_ipv6_addr(ab, &sk->sk_v6_rcv_saddr, inet->inet_sport, "laddr", "lport"); print_ipv6_addr(ab, &sk->sk_v6_daddr, inet->inet_dport, "faddr", "fport"); break; } #endif case AF_UNIX: u = unix_sk(sk); addr = smp_load_acquire(&u->addr); if (!addr) break; if (u->path.dentry) { audit_log_d_path(ab, " path=", &u->path); break; } len = addr->len-sizeof(short); p = &addr->name->sun_path[0]; audit_log_format(ab, " path="); if (*p) audit_log_untrustedstring(ab, p); else audit_log_n_hex(ab, p, len); break; } } switch (a->u.net->family) { case AF_INET: print_ipv4_addr(ab, a->u.net->v4info.saddr, a->u.net->sport, "saddr", "src"); print_ipv4_addr(ab, a->u.net->v4info.daddr, a->u.net->dport, "daddr", "dest"); break; case AF_INET6: print_ipv6_addr(ab, &a->u.net->v6info.saddr, a->u.net->sport, "saddr", "src"); print_ipv6_addr(ab, &a->u.net->v6info.daddr, a->u.net->dport, "daddr", "dest"); break; } if (a->u.net->netif > 0) { struct net_device *dev; /* NOTE: we always use init's namespace */ dev = dev_get_by_index(&init_net, a->u.net->netif); if (dev) { audit_log_format(ab, " netif=%s", dev->name); dev_put(dev); } } break; #ifdef CONFIG_KEYS case LSM_AUDIT_DATA_KEY: audit_log_format(ab, " key_serial=%u", a->u.key_struct.key); if (a->u.key_struct.key_desc) { audit_log_format(ab, " key_desc="); audit_log_untrustedstring(ab, a->u.key_struct.key_desc); } break; #endif case LSM_AUDIT_DATA_KMOD: audit_log_format(ab, " kmod="); audit_log_untrustedstring(ab, a->u.kmod_name); break; case LSM_AUDIT_DATA_IBPKEY: { struct in6_addr sbn_pfx; memset(&sbn_pfx.s6_addr, 0, sizeof(sbn_pfx.s6_addr)); memcpy(&sbn_pfx.s6_addr, &a->u.ibpkey->subnet_prefix, sizeof(a->u.ibpkey->subnet_prefix)); audit_log_format(ab, " pkey=0x%x subnet_prefix=%pI6c", a->u.ibpkey->pkey, &sbn_pfx); break; } case LSM_AUDIT_DATA_IBENDPORT: audit_log_format(ab, " device=%s port_num=%u", a->u.ibendport->dev_name, a->u.ibendport->port); break; case LSM_AUDIT_DATA_LOCKDOWN: audit_log_format(ab, " lockdown_reason=\"%s\"", lockdown_reasons[a->u.reason]); break; case LSM_AUDIT_DATA_ANONINODE: audit_log_format(ab, " anonclass=%s", a->u.anonclass); break; } /* switch (a->type) */ } /** * common_lsm_audit - generic LSM auditing function * @a: auxiliary audit data * @pre_audit: lsm-specific pre-audit callback * @post_audit: lsm-specific post-audit callback * * setup the audit buffer for common security information * uses callback to print LSM specific information */ void common_lsm_audit(struct common_audit_data *a, void (*pre_audit)(struct audit_buffer *, void *), void (*post_audit)(struct audit_buffer *, void *)) { struct audit_buffer *ab; if (a == NULL) return; /* we use GFP_ATOMIC so we won't sleep */ ab = audit_log_start(audit_context(), GFP_ATOMIC | __GFP_NOWARN, AUDIT_AVC); if (ab == NULL) return; if (pre_audit) pre_audit(ab, a); dump_common_audit_data(ab, a); if (post_audit) post_audit(ab, a); audit_log_end(ab); } |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __MEMTYPE_H_ #define __MEMTYPE_H_ extern int pat_debug_enable; #define dprintk(fmt, arg...) \ do { if (pat_debug_enable) pr_info("x86/PAT: " fmt, ##arg); } while (0) struct memtype { u64 start; u64 end; u64 subtree_max_end; enum page_cache_mode type; struct rb_node rb; }; static inline char *cattr_name(enum page_cache_mode pcm) { switch (pcm) { case _PAGE_CACHE_MODE_UC: return "uncached"; case _PAGE_CACHE_MODE_UC_MINUS: return "uncached-minus"; case _PAGE_CACHE_MODE_WB: return "write-back"; case _PAGE_CACHE_MODE_WC: return "write-combining"; case _PAGE_CACHE_MODE_WT: return "write-through"; case _PAGE_CACHE_MODE_WP: return "write-protected"; default: return "broken"; } } #ifdef CONFIG_X86_PAT extern int memtype_check_insert(struct memtype *entry_new, enum page_cache_mode *new_type); extern struct memtype *memtype_erase(u64 start, u64 end); extern struct memtype *memtype_lookup(u64 addr); extern int memtype_copy_nth_element(struct memtype *entry_out, loff_t pos); #else static inline int memtype_check_insert(struct memtype *entry_new, enum page_cache_mode *new_type) { return 0; } static inline struct memtype *memtype_erase(u64 start, u64 end) { return NULL; } static inline struct memtype *memtype_lookup(u64 addr) { return NULL; } static inline int memtype_copy_nth_element(struct memtype *out, loff_t pos) { return 0; } #endif #endif /* __MEMTYPE_H_ */ |
13 13 13 13 13 13 13 13 12 11 11 11 11 11 11 11 11 11 11 11 13 13 13 11 11 13 13 11 11 12 3 13 13 13 13 13 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 | // SPDX-License-Identifier: GPL-2.0 /* * List pending timers * * Copyright(C) 2006, Red Hat, Inc., Ingo Molnar */ #include <linux/proc_fs.h> #include <linux/module.h> #include <linux/spinlock.h> #include <linux/sched.h> #include <linux/seq_file.h> #include <linux/kallsyms.h> #include <linux/nmi.h> #include <linux/uaccess.h> #include "tick-internal.h" struct timer_list_iter { int cpu; bool second_pass; u64 now; }; /* * This allows printing both to /proc/timer_list and * to the console (on SysRq-Q): */ __printf(2, 3) static void SEQ_printf(struct seq_file *m, const char *fmt, ...) { va_list args; va_start(args, fmt); if (m) seq_vprintf(m, fmt, args); else vprintk(fmt, args); va_end(args); } static void print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer, int idx, u64 now) { SEQ_printf(m, " #%d: <%pK>, %ps", idx, taddr, timer->function); SEQ_printf(m, ", S:%02x", timer->state); SEQ_printf(m, "\n"); SEQ_printf(m, " # expires at %Lu-%Lu nsecs [in %Ld to %Ld nsecs]\n", (unsigned long long)ktime_to_ns(hrtimer_get_softexpires(timer)), (unsigned long long)ktime_to_ns(hrtimer_get_expires(timer)), (long long)(ktime_to_ns(hrtimer_get_softexpires(timer)) - now), (long long)(ktime_to_ns(hrtimer_get_expires(timer)) - now)); } static void print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) { struct hrtimer *timer, tmp; unsigned long next = 0, i; struct timerqueue_node *curr; unsigned long flags; next_one: i = 0; touch_nmi_watchdog(); raw_spin_lock_irqsave(&base->cpu_base->lock, flags); curr = timerqueue_getnext(&base->active); /* * Crude but we have to do this O(N*N) thing, because * we have to unlock the base when printing: */ while (curr && i < next) { curr = timerqueue_iterate_next(curr); i++; } if (curr) { timer = container_of(curr, struct hrtimer, node); tmp = *timer; raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags); print_timer(m, timer, &tmp, i, now); next++; goto next_one; } raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags); } static void print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) { SEQ_printf(m, " .base: %pK\n", base); SEQ_printf(m, " .index: %d\n", base->index); SEQ_printf(m, " .resolution: %u nsecs\n", hrtimer_resolution); SEQ_printf(m, " .get_time: %ps\n", base->get_time); #ifdef CONFIG_HIGH_RES_TIMERS SEQ_printf(m, " .offset: %Lu nsecs\n", (unsigned long long) ktime_to_ns(base->offset)); #endif SEQ_printf(m, "active timers:\n"); print_active_timers(m, base, now + ktime_to_ns(base->offset)); } static void print_cpu(struct seq_file *m, int cpu, u64 now) { struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); int i; SEQ_printf(m, "cpu: %d\n", cpu); for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { SEQ_printf(m, " clock %d:\n", i); print_base(m, cpu_base->clock_base + i, now); } #define P(x) \ SEQ_printf(m, " .%-15s: %Lu\n", #x, \ (unsigned long long)(cpu_base->x)) #define P_ns(x) \ SEQ_printf(m, " .%-15s: %Lu nsecs\n", #x, \ (unsigned long long)(ktime_to_ns(cpu_base->x))) #ifdef CONFIG_HIGH_RES_TIMERS P_ns(expires_next); P(hres_active); P(nr_events); P(nr_retries); P(nr_hangs); P(max_hang_time); #endif #undef P #undef P_ns #ifdef CONFIG_TICK_ONESHOT # define P(x) \ SEQ_printf(m, " .%-15s: %Lu\n", #x, \ (unsigned long long)(ts->x)) # define P_ns(x) \ SEQ_printf(m, " .%-15s: %Lu nsecs\n", #x, \ (unsigned long long)(ktime_to_ns(ts->x))) { struct tick_sched *ts = tick_get_tick_sched(cpu); P(nohz_mode); P_ns(last_tick); P(tick_stopped); P(idle_jiffies); P(idle_calls); P(idle_sleeps); P_ns(idle_entrytime); P_ns(idle_waketime); P_ns(idle_exittime); P_ns(idle_sleeptime); P_ns(iowait_sleeptime); P(last_jiffies); P(next_timer); P_ns(idle_expires); SEQ_printf(m, "jiffies: %Lu\n", (unsigned long long)jiffies); } #endif #undef P #undef P_ns SEQ_printf(m, "\n"); } #ifdef CONFIG_GENERIC_CLOCKEVENTS static void print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) { struct clock_event_device *dev = td->evtdev; touch_nmi_watchdog(); SEQ_printf(m, "Tick Device: mode: %d\n", td->mode); if (cpu < 0) SEQ_printf(m, "Broadcast device\n"); else SEQ_printf(m, "Per CPU device: %d\n", cpu); SEQ_printf(m, "Clock Event Device: "); if (!dev) { SEQ_printf(m, "<NULL>\n"); return; } SEQ_printf(m, "%s\n", dev->name); SEQ_printf(m, " max_delta_ns: %llu\n", (unsigned long long) dev->max_delta_ns); SEQ_printf(m, " min_delta_ns: %llu\n", (unsigned long long) dev->min_delta_ns); SEQ_printf(m, " mult: %u\n", dev->mult); SEQ_printf(m, " shift: %u\n", dev->shift); SEQ_printf(m, " mode: %d\n", clockevent_get_state(dev)); SEQ_printf(m, " next_event: %Ld nsecs\n", (unsigned long long) ktime_to_ns(dev->next_event)); SEQ_printf(m, " set_next_event: %ps\n", dev->set_next_event); if (dev->set_state_shutdown) SEQ_printf(m, " shutdown: %ps\n", dev->set_state_shutdown); if (dev->set_state_periodic) SEQ_printf(m, " periodic: %ps\n", dev->set_state_periodic); if (dev->set_state_oneshot) SEQ_printf(m, " oneshot: %ps\n", dev->set_state_oneshot); if (dev->set_state_oneshot_stopped) SEQ_printf(m, " oneshot stopped: %ps\n", dev->set_state_oneshot_stopped); if (dev->tick_resume) SEQ_printf(m, " resume: %ps\n", dev->tick_resume); SEQ_printf(m, " event_handler: %ps\n", dev->event_handler); SEQ_printf(m, "\n"); SEQ_printf(m, " retries: %lu\n", dev->retries); #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST if (cpu >= 0) { const struct clock_event_device *wd = tick_get_wakeup_device(cpu); SEQ_printf(m, "Wakeup Device: %s\n", wd ? wd->name : "<NULL>"); } #endif SEQ_printf(m, "\n"); } static void timer_list_show_tickdevices_header(struct seq_file *m) { #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST print_tickdevice(m, tick_get_broadcast_device(), -1); SEQ_printf(m, "tick_broadcast_mask: %*pb\n", cpumask_pr_args(tick_get_broadcast_mask())); #ifdef CONFIG_TICK_ONESHOT SEQ_printf(m, "tick_broadcast_oneshot_mask: %*pb\n", cpumask_pr_args(tick_get_broadcast_oneshot_mask())); #endif SEQ_printf(m, "\n"); #endif } #endif static inline void timer_list_header(struct seq_file *m, u64 now) { SEQ_printf(m, "Timer List Version: v0.9\n"); SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); SEQ_printf(m, "\n"); } void sysrq_timer_list_show(void) { u64 now = ktime_to_ns(ktime_get()); int cpu; timer_list_header(NULL, now); for_each_online_cpu(cpu) print_cpu(NULL, cpu, now); #ifdef CONFIG_GENERIC_CLOCKEVENTS timer_list_show_tickdevices_header(NULL); for_each_online_cpu(cpu) print_tickdevice(NULL, tick_get_device(cpu), cpu); #endif return; } #ifdef CONFIG_PROC_FS static int timer_list_show(struct seq_file *m, void *v) { struct timer_list_iter *iter = v; if (iter->cpu == -1 && !iter->second_pass) timer_list_header(m, iter->now); else if (!iter->second_pass) print_cpu(m, iter->cpu, iter->now); #ifdef CONFIG_GENERIC_CLOCKEVENTS else if (iter->cpu == -1 && iter->second_pass) timer_list_show_tickdevices_header(m); else print_tickdevice(m, tick_get_device(iter->cpu), iter->cpu); #endif return 0; } static void *move_iter(struct timer_list_iter *iter, loff_t offset) { for (; offset; offset--) { iter->cpu = cpumask_next(iter->cpu, cpu_online_mask); if (iter->cpu >= nr_cpu_ids) { #ifdef CONFIG_GENERIC_CLOCKEVENTS if (!iter->second_pass) { iter->cpu = -1; iter->second_pass = true; } else return NULL; #else return NULL; #endif } } return iter; } static void *timer_list_start(struct seq_file *file, loff_t *offset) { struct timer_list_iter *iter = file->private; if (!*offset) iter->now = ktime_to_ns(ktime_get()); iter->cpu = -1; iter->second_pass = false; return move_iter(iter, *offset); } static void *timer_list_next(struct seq_file *file, void *v, loff_t *offset) { struct timer_list_iter *iter = file->private; ++*offset; return move_iter(iter, 1); } static void timer_list_stop(struct seq_file *seq, void *v) { } static const struct seq_operations timer_list_sops = { .start = timer_list_start, .next = timer_list_next, .stop = timer_list_stop, .show = timer_list_show, }; static int __init init_timer_list_procfs(void) { struct proc_dir_entry *pe; pe = proc_create_seq_private("timer_list", 0400, NULL, &timer_list_sops, sizeof(struct timer_list_iter), NULL); if (!pe) return -ENOMEM; return 0; } __initcall(init_timer_list_procfs); #endif |
629 629 629 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 | // SPDX-License-Identifier: GPL-2.0 /* * Implementation of the symbol table type. * * Author : Stephen Smalley, <sds@tycho.nsa.gov> */ #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include "symtab.h" static unsigned int symhash(const void *key) { const char *p, *keyp; unsigned int size; unsigned int val; val = 0; keyp = key; size = strlen(keyp); for (p = keyp; (p - keyp) < size; p++) val = (val << 4 | (val >> (8*sizeof(unsigned int)-4))) ^ (*p); return val; } static int symcmp(const void *key1, const void *key2) { const char *keyp1, *keyp2; keyp1 = key1; keyp2 = key2; return strcmp(keyp1, keyp2); } static const struct hashtab_key_params symtab_key_params = { .hash = symhash, .cmp = symcmp, }; int symtab_init(struct symtab *s, unsigned int size) { s->nprim = 0; return hashtab_init(&s->table, size); } int symtab_insert(struct symtab *s, char *name, void *datum) { return hashtab_insert(&s->table, name, datum, symtab_key_params); } void *symtab_search(struct symtab *s, const char *name) { return hashtab_search(&s->table, name, symtab_key_params); } |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * acpi_bus.h - ACPI Bus Driver ($Revision: 22 $) * * Copyright (C) 2001, 2002 Andy Grover <andrew.grover@intel.com> * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com> */ #ifndef __ACPI_BUS_H__ #define __ACPI_BUS_H__ #include <linux/device.h> #include <linux/property.h> /* TBD: Make dynamic */ #define ACPI_MAX_HANDLES 10 struct acpi_handle_list { u32 count; acpi_handle handles[ACPI_MAX_HANDLES]; }; /* acpi_utils.h */ acpi_status acpi_extract_package(union acpi_object *package, struct acpi_buffer *format, struct acpi_buffer *buffer); acpi_status acpi_evaluate_integer(acpi_handle handle, acpi_string pathname, struct acpi_object_list *arguments, unsigned long long *data); acpi_status acpi_evaluate_reference(acpi_handle handle, acpi_string pathname, struct acpi_object_list *arguments, struct acpi_handle_list *list); acpi_status acpi_evaluate_ost(acpi_handle handle, u32 source_event, u32 status_code, struct acpi_buffer *status_buf); acpi_status acpi_get_physical_device_location(acpi_handle handle, struct acpi_pld_info **pld); bool acpi_has_method(acpi_handle handle, char *name); acpi_status acpi_execute_simple_method(acpi_handle handle, char *method, u64 arg); acpi_status acpi_evaluate_ej0(acpi_handle handle); acpi_status acpi_evaluate_lck(acpi_handle handle, int lock); acpi_status acpi_evaluate_reg(acpi_handle handle, u8 space_id, u32 function); bool acpi_ata_match(acpi_handle handle); bool acpi_bay_match(acpi_handle handle); bool acpi_dock_match(acpi_handle handle); bool acpi_check_dsm(acpi_handle handle, const guid_t *guid, u64 rev, u64 funcs); union acpi_object *acpi_evaluate_dsm(acpi_handle handle, const guid_t *guid, u64 rev, u64 func, union acpi_object *argv4); static inline union acpi_object * acpi_evaluate_dsm_typed(acpi_handle handle, const guid_t *guid, u64 rev, u64 func, union acpi_object *argv4, acpi_object_type type) { union acpi_object *obj; obj = acpi_evaluate_dsm(handle, guid, rev, func, argv4); if (obj && obj->type != type) { ACPI_FREE(obj); obj = NULL; } return obj; } #define ACPI_INIT_DSM_ARGV4(cnt, eles) \ { \ .package.type = ACPI_TYPE_PACKAGE, \ .package.count = (cnt), \ .package.elements = (eles) \ } bool acpi_dev_found(const char *hid); bool acpi_dev_present(const char *hid, const char *uid, s64 hrv); bool acpi_reduced_hardware(void); #ifdef CONFIG_ACPI struct proc_dir_entry; #define ACPI_BUS_FILE_ROOT "acpi" extern struct proc_dir_entry *acpi_root_dir; enum acpi_bus_device_type { ACPI_BUS_TYPE_DEVICE = 0, ACPI_BUS_TYPE_POWER, ACPI_BUS_TYPE_PROCESSOR, ACPI_BUS_TYPE_THERMAL, ACPI_BUS_TYPE_POWER_BUTTON, ACPI_BUS_TYPE_SLEEP_BUTTON, ACPI_BUS_TYPE_ECDT_EC, ACPI_BUS_DEVICE_TYPE_COUNT }; struct acpi_driver; struct acpi_device; /* * ACPI Scan Handler * ----------------- */ struct acpi_hotplug_profile { struct kobject kobj; int (*scan_dependent)(struct acpi_device *adev); void (*notify_online)(struct acpi_device *adev); bool enabled:1; bool demand_offline:1; }; static inline struct acpi_hotplug_profile *to_acpi_hotplug_profile( struct kobject *kobj) { return container_of(kobj, struct acpi_hotplug_profile, kobj); } struct acpi_scan_handler { const struct acpi_device_id *ids; struct list_head list_node; bool (*match)(const char *idstr, const struct acpi_device_id **matchid); int (*attach)(struct acpi_device *dev, const struct acpi_device_id *id); void (*detach)(struct acpi_device *dev); void (*bind)(struct device *phys_dev); void (*unbind)(struct device *phys_dev); struct acpi_hotplug_profile hotplug; }; /* * ACPI Hotplug Context * -------------------- */ struct acpi_hotplug_context { struct acpi_device *self; int (*notify)(struct acpi_device *, u32); void (*uevent)(struct acpi_device *, u32); void (*fixup)(struct acpi_device *); }; /* * ACPI Driver * ----------- */ typedef int (*acpi_op_add) (struct acpi_device * device); typedef int (*acpi_op_remove) (struct acpi_device * device); typedef void (*acpi_op_notify) (struct acpi_device * device, u32 event); struct acpi_device_ops { acpi_op_add add; acpi_op_remove remove; acpi_op_notify notify; }; #define ACPI_DRIVER_ALL_NOTIFY_EVENTS 0x1 /* system AND device events */ struct acpi_driver { char name[80]; char class[80]; const struct acpi_device_id *ids; /* Supported Hardware IDs */ unsigned int flags; struct acpi_device_ops ops; struct device_driver drv; struct module *owner; }; /* * ACPI Device * ----------- */ /* Status (_STA) */ struct acpi_device_status { u32 present:1; u32 enabled:1; u32 show_in_ui:1; u32 functional:1; u32 battery_present:1; u32 reserved:27; }; /* Flags */ struct acpi_device_flags { u32 dynamic_status:1; u32 removable:1; u32 ejectable:1; u32 power_manageable:1; u32 match_driver:1; u32 initialized:1; u32 visited:1; u32 hotplug_notify:1; u32 is_dock_station:1; u32 of_compatible_ok:1; u32 coherent_dma:1; u32 cca_seen:1; u32 enumeration_by_parent:1; u32 honor_deps:1; u32 reserved:18; }; /* File System */ struct acpi_device_dir { struct proc_dir_entry *entry; }; #define acpi_device_dir(d) ((d)->dir.entry) /* Plug and Play */ typedef char acpi_bus_id[8]; typedef u64 acpi_bus_address; typedef char acpi_device_name[40]; typedef char acpi_device_class[20]; struct acpi_hardware_id { struct list_head list; const char *id; }; struct acpi_pnp_type { u32 hardware_id:1; u32 bus_address:1; u32 platform_id:1; u32 backlight:1; u32 reserved:28; }; struct acpi_device_pnp { acpi_bus_id bus_id; /* Object name */ int instance_no; /* Instance number of this object */ struct acpi_pnp_type type; /* ID type */ acpi_bus_address bus_address; /* _ADR */ char *unique_id; /* _UID */ struct list_head ids; /* _HID and _CIDs */ acpi_device_name device_name; /* Driver-determined */ acpi_device_class device_class; /* " */ union acpi_object *str_obj; /* unicode string for _STR method */ }; #define acpi_device_bid(d) ((d)->pnp.bus_id) #define acpi_device_adr(d) ((d)->pnp.bus_address) const char *acpi_device_hid(struct acpi_device *device); #define acpi_device_uid(d) ((d)->pnp.unique_id) #define acpi_device_name(d) ((d)->pnp.device_name) #define acpi_device_class(d) ((d)->pnp.device_class) /* Power Management */ struct acpi_device_power_flags { u32 explicit_get:1; /* _PSC present? */ u32 power_resources:1; /* Power resources */ u32 inrush_current:1; /* Serialize Dx->D0 */ u32 power_removed:1; /* Optimize Dx->D0 */ u32 ignore_parent:1; /* Power is independent of parent power state */ u32 dsw_present:1; /* _DSW present? */ u32 reserved:26; }; struct acpi_device_power_state { struct { u8 valid:1; u8 explicit_set:1; /* _PSx present? */ u8 reserved:6; } flags; int power; /* % Power (compared to D0) */ int latency; /* Dx->D0 time (microseconds) */ struct list_head resources; /* Power resources referenced */ }; struct acpi_device_power { int state; /* Current state */ struct acpi_device_power_flags flags; struct acpi_device_power_state states[ACPI_D_STATE_COUNT]; /* Power states (D0-D3Cold) */ u8 state_for_enumeration; /* Deepest power state for enumeration */ }; struct acpi_dep_data { struct list_head node; acpi_handle supplier; acpi_handle consumer; bool honor_dep; }; /* Performance Management */ struct acpi_device_perf_flags { u8 reserved:8; }; struct acpi_device_perf_state { struct { u8 valid:1; u8 reserved:7; } flags; u8 power; /* % Power (compared to P0) */ u8 performance; /* % Performance ( " ) */ int latency; /* Px->P0 time (microseconds) */ }; struct acpi_device_perf { int state; struct acpi_device_perf_flags flags; int state_count; struct acpi_device_perf_state *states; }; /* Wakeup Management */ struct acpi_device_wakeup_flags { u8 valid:1; /* Can successfully enable wakeup? */ u8 notifier_present:1; /* Wake-up notify handler has been installed */ }; struct acpi_device_wakeup_context { void (*func)(struct acpi_device_wakeup_context *context); struct device *dev; }; struct acpi_device_wakeup { acpi_handle gpe_device; u64 gpe_number; u64 sleep_state; struct list_head resources; struct acpi_device_wakeup_flags flags; struct acpi_device_wakeup_context context; struct wakeup_source *ws; int prepare_count; int enable_count; }; struct acpi_device_physical_node { unsigned int node_id; struct list_head node; struct device *dev; bool put_online:1; }; struct acpi_device_properties { const guid_t *guid; union acpi_object *properties; struct list_head list; void **bufs; }; /* ACPI Device Specific Data (_DSD) */ struct acpi_device_data { const union acpi_object *pointer; struct list_head properties; const union acpi_object *of_compatible; struct list_head subnodes; }; struct acpi_gpio_mapping; /* Device */ struct acpi_device { u32 pld_crc; int device_type; acpi_handle handle; /* no handle for fixed hardware */ struct fwnode_handle fwnode; struct list_head wakeup_list; struct list_head del_list; struct acpi_device_status status; struct acpi_device_flags flags; struct acpi_device_pnp pnp; struct acpi_device_power power; struct acpi_device_wakeup wakeup; struct acpi_device_perf performance; struct acpi_device_dir dir; struct acpi_device_data data; struct acpi_scan_handler *handler; struct acpi_hotplug_context *hp; const struct acpi_gpio_mapping *driver_gpios; void *driver_data; struct device dev; unsigned int physical_node_count; unsigned int dep_unmet; struct list_head physical_node_list; struct mutex physical_node_lock; void (*remove)(struct acpi_device *); }; /* Non-device subnode */ struct acpi_data_node { const char *name; acpi_handle handle; struct fwnode_handle fwnode; struct fwnode_handle *parent; struct acpi_device_data data; struct list_head sibling; struct kobject kobj; struct completion kobj_done; }; extern const struct fwnode_operations acpi_device_fwnode_ops; extern const struct fwnode_operations acpi_data_fwnode_ops; extern const struct fwnode_operations acpi_static_fwnode_ops; bool is_acpi_device_node(const struct fwnode_handle *fwnode); bool is_acpi_data_node(const struct fwnode_handle *fwnode); static inline bool is_acpi_node(const struct fwnode_handle *fwnode) { return (is_acpi_device_node(fwnode) || is_acpi_data_node(fwnode)); } #define to_acpi_device_node(__fwnode) \ ({ \ typeof(__fwnode) __to_acpi_device_node_fwnode = __fwnode; \ \ is_acpi_device_node(__to_acpi_device_node_fwnode) ? \ container_of(__to_acpi_device_node_fwnode, \ struct acpi_device, fwnode) : \ NULL; \ }) #define to_acpi_data_node(__fwnode) \ ({ \ typeof(__fwnode) __to_acpi_data_node_fwnode = __fwnode; \ \ is_acpi_data_node(__to_acpi_data_node_fwnode) ? \ container_of(__to_acpi_data_node_fwnode, \ struct acpi_data_node, fwnode) : \ NULL; \ }) static inline bool is_acpi_static_node(const struct fwnode_handle *fwnode) { return !IS_ERR_OR_NULL(fwnode) && fwnode->ops == &acpi_static_fwnode_ops; } static inline bool acpi_data_node_match(const struct fwnode_handle *fwnode, const char *name) { return is_acpi_data_node(fwnode) ? (!strcmp(to_acpi_data_node(fwnode)->name, name)) : false; } static inline struct fwnode_handle *acpi_fwnode_handle(struct acpi_device *adev) { return &adev->fwnode; } static inline void *acpi_driver_data(struct acpi_device *d) { return d->driver_data; } #define to_acpi_device(d) container_of(d, struct acpi_device, dev) #define to_acpi_driver(d) container_of(d, struct acpi_driver, drv) static inline struct acpi_device *acpi_dev_parent(struct acpi_device *adev) { if (adev->dev.parent) return to_acpi_device(adev->dev.parent); return NULL; } static inline void acpi_set_device_status(struct acpi_device *adev, u32 sta) { *((u32 *)&adev->status) = sta; } static inline void acpi_set_hp_context(struct acpi_device *adev, struct acpi_hotplug_context *hp) { hp->self = adev; adev->hp = hp; } void acpi_initialize_hp_context(struct acpi_device *adev, struct acpi_hotplug_context *hp, int (*notify)(struct acpi_device *, u32), void (*uevent)(struct acpi_device *, u32)); /* acpi_device.dev.bus == &acpi_bus_type */ extern struct bus_type acpi_bus_type; int acpi_bus_for_each_dev(int (*fn)(struct device *, void *), void *data); int acpi_dev_for_each_child(struct acpi_device *adev, int (*fn)(struct acpi_device *, void *), void *data); int acpi_dev_for_each_child_reverse(struct acpi_device *adev, int (*fn)(struct acpi_device *, void *), void *data); /* * Events * ------ */ struct acpi_bus_event { struct list_head node; acpi_device_class device_class; acpi_bus_id bus_id; u32 type; u32 data; }; extern struct kobject *acpi_kobj; extern int acpi_bus_generate_netlink_event(const char*, const char*, u8, int); void acpi_bus_private_data_handler(acpi_handle, void *); int acpi_bus_get_private_data(acpi_handle, void **); int acpi_bus_attach_private_data(acpi_handle, void *); void acpi_bus_detach_private_data(acpi_handle); extern int acpi_notifier_call_chain(struct acpi_device *, u32, u32); extern int register_acpi_notifier(struct notifier_block *); extern int unregister_acpi_notifier(struct notifier_block *); /* * External Functions */ acpi_status acpi_bus_get_status_handle(acpi_handle handle, unsigned long long *sta); int acpi_bus_get_status(struct acpi_device *device); int acpi_bus_set_power(acpi_handle handle, int state); const char *acpi_power_state_string(int state); int acpi_device_set_power(struct acpi_device *device, int state); int acpi_bus_init_power(struct acpi_device *device); int acpi_device_fix_up_power(struct acpi_device *device); void acpi_device_fix_up_power_extended(struct acpi_device *adev); int acpi_bus_update_power(acpi_handle handle, int *state_p); int acpi_device_update_power(struct acpi_device *device, int *state_p); bool acpi_bus_power_manageable(acpi_handle handle); void acpi_dev_power_up_children_with_adr(struct acpi_device *adev); u8 acpi_dev_power_state_for_wake(struct acpi_device *adev); int acpi_device_power_add_dependent(struct acpi_device *adev, struct device *dev); void acpi_device_power_remove_dependent(struct acpi_device *adev, struct device *dev); #ifdef CONFIG_PM bool acpi_bus_can_wakeup(acpi_handle handle); #else static inline bool acpi_bus_can_wakeup(acpi_handle handle) { return false; } #endif void acpi_scan_lock_acquire(void); void acpi_scan_lock_release(void); void acpi_lock_hp_context(void); void acpi_unlock_hp_context(void); int acpi_scan_add_handler(struct acpi_scan_handler *handler); int acpi_bus_register_driver(struct acpi_driver *driver); void acpi_bus_unregister_driver(struct acpi_driver *driver); int acpi_bus_scan(acpi_handle handle); void acpi_bus_trim(struct acpi_device *start); acpi_status acpi_bus_get_ejd(acpi_handle handle, acpi_handle * ejd); int acpi_match_device_ids(struct acpi_device *device, const struct acpi_device_id *ids); void acpi_set_modalias(struct acpi_device *adev, const char *default_id, char *modalias, size_t len); int acpi_create_dir(struct acpi_device *); void acpi_remove_dir(struct acpi_device *); static inline bool acpi_device_enumerated(struct acpi_device *adev) { return adev && adev->flags.initialized && adev->flags.visited; } /** * module_acpi_driver(acpi_driver) - Helper macro for registering an ACPI driver * @__acpi_driver: acpi_driver struct * * Helper macro for ACPI drivers which do not do anything special in module * init/exit. This eliminates a lot of boilerplate. Each module may only * use this macro once, and calling it replaces module_init() and module_exit() */ #define module_acpi_driver(__acpi_driver) \ module_driver(__acpi_driver, acpi_bus_register_driver, \ acpi_bus_unregister_driver) /* * Bind physical devices with ACPI devices */ struct acpi_bus_type { struct list_head list; const char *name; bool (*match)(struct device *dev); struct acpi_device * (*find_companion)(struct device *); void (*setup)(struct device *); }; int register_acpi_bus_type(struct acpi_bus_type *); int unregister_acpi_bus_type(struct acpi_bus_type *); int acpi_bind_one(struct device *dev, struct acpi_device *adev); int acpi_unbind_one(struct device *dev); enum acpi_bridge_type { ACPI_BRIDGE_TYPE_PCIE = 1, ACPI_BRIDGE_TYPE_CXL, }; struct acpi_pci_root { struct acpi_device * device; struct pci_bus *bus; u16 segment; int bridge_type; struct resource secondary; /* downstream bus range */ u32 osc_support_set; /* _OSC state of support bits */ u32 osc_control_set; /* _OSC state of control bits */ u32 osc_ext_support_set; /* _OSC state of extended support bits */ u32 osc_ext_control_set; /* _OSC state of extended control bits */ phys_addr_t mcfg_addr; }; /* helper */ bool acpi_dma_supported(const struct acpi_device *adev); enum dev_dma_attr acpi_get_dma_attr(struct acpi_device *adev); int acpi_iommu_fwspec_init(struct device *dev, u32 id, struct fwnode_handle *fwnode, const struct iommu_ops *ops); int acpi_dma_get_range(struct device *dev, const struct bus_dma_region **map); int acpi_dma_configure_id(struct device *dev, enum dev_dma_attr attr, const u32 *input_id); static inline int acpi_dma_configure(struct device *dev, enum dev_dma_attr attr) { return acpi_dma_configure_id(dev, attr, NULL); } struct acpi_device *acpi_find_child_device(struct acpi_device *parent, u64 address, bool check_children); struct acpi_device *acpi_find_child_by_adr(struct acpi_device *adev, acpi_bus_address adr); int acpi_is_root_bridge(acpi_handle); struct acpi_pci_root *acpi_pci_find_root(acpi_handle handle); int acpi_enable_wakeup_device_power(struct acpi_device *dev, int state); int acpi_disable_wakeup_device_power(struct acpi_device *dev); #ifdef CONFIG_X86 bool acpi_device_override_status(struct acpi_device *adev, unsigned long long *status); bool acpi_quirk_skip_acpi_ac_and_battery(void); #else static inline bool acpi_device_override_status(struct acpi_device *adev, unsigned long long *status) { return false; } static inline bool acpi_quirk_skip_acpi_ac_and_battery(void) { return false; } #endif #if IS_ENABLED(CONFIG_X86_ANDROID_TABLETS) bool acpi_quirk_skip_i2c_client_enumeration(struct acpi_device *adev); int acpi_quirk_skip_serdev_enumeration(struct device *controller_parent, bool *skip); #else static inline bool acpi_quirk_skip_i2c_client_enumeration(struct acpi_device *adev) { return false; } static inline int acpi_quirk_skip_serdev_enumeration(struct device *controller_parent, bool *skip) { *skip = false; return 0; } #endif #ifdef CONFIG_PM void acpi_pm_wakeup_event(struct device *dev); acpi_status acpi_add_pm_notifier(struct acpi_device *adev, struct device *dev, void (*func)(struct acpi_device_wakeup_context *context)); acpi_status acpi_remove_pm_notifier(struct acpi_device *adev); bool acpi_pm_device_can_wakeup(struct device *dev); int acpi_pm_device_sleep_state(struct device *, int *, int); int acpi_pm_set_device_wakeup(struct device *dev, bool enable); #else static inline void acpi_pm_wakeup_event(struct device *dev) { } static inline acpi_status acpi_add_pm_notifier(struct acpi_device *adev, struct device *dev, void (*func)(struct acpi_device_wakeup_context *context)) { return AE_SUPPORT; } static inline acpi_status acpi_remove_pm_notifier(struct acpi_device *adev) { return AE_SUPPORT; } static inline bool acpi_pm_device_can_wakeup(struct device *dev) { return false; } static inline int acpi_pm_device_sleep_state(struct device *d, int *p, int m) { if (p) *p = ACPI_STATE_D0; return (m >= ACPI_STATE_D0 && m <= ACPI_STATE_D3_COLD) ? m : ACPI_STATE_D0; } static inline int acpi_pm_set_device_wakeup(struct device *dev, bool enable) { return -ENODEV; } #endif #ifdef CONFIG_ACPI_SYSTEM_POWER_STATES_SUPPORT bool acpi_sleep_state_supported(u8 sleep_state); #else static inline bool acpi_sleep_state_supported(u8 sleep_state) { return false; } #endif #ifdef CONFIG_ACPI_SLEEP u32 acpi_target_system_state(void); #else static inline u32 acpi_target_system_state(void) { return ACPI_STATE_S0; } #endif static inline bool acpi_device_power_manageable(struct acpi_device *adev) { return adev->flags.power_manageable; } static inline bool acpi_device_can_wakeup(struct acpi_device *adev) { return adev->wakeup.flags.valid; } static inline bool acpi_device_can_poweroff(struct acpi_device *adev) { return adev->power.states[ACPI_STATE_D3_COLD].flags.valid || ((acpi_gbl_FADT.header.revision < 6) && adev->power.states[ACPI_STATE_D3_HOT].flags.explicit_set); } bool acpi_dev_hid_uid_match(struct acpi_device *adev, const char *hid2, const char *uid2); int acpi_dev_uid_to_integer(struct acpi_device *adev, u64 *integer); void acpi_dev_clear_dependencies(struct acpi_device *supplier); bool acpi_dev_ready_for_enumeration(const struct acpi_device *device); struct acpi_device *acpi_dev_get_next_consumer_dev(struct acpi_device *supplier, struct acpi_device *start); /** * for_each_acpi_consumer_dev - iterate over the consumer ACPI devices for a * given supplier * @supplier: Pointer to the supplier's ACPI device * @consumer: Pointer to &struct acpi_device to hold the consumer, initially NULL */ #define for_each_acpi_consumer_dev(supplier, consumer) \ for (consumer = acpi_dev_get_next_consumer_dev(supplier, NULL); \ consumer; \ consumer = acpi_dev_get_next_consumer_dev(supplier, consumer)) struct acpi_device * acpi_dev_get_next_match_dev(struct acpi_device *adev, const char *hid, const char *uid, s64 hrv); struct acpi_device * acpi_dev_get_first_match_dev(const char *hid, const char *uid, s64 hrv); /** * for_each_acpi_dev_match - iterate over ACPI devices that matching the criteria * @adev: pointer to the matching ACPI device, NULL at the end of the loop * @hid: Hardware ID of the device. * @uid: Unique ID of the device, pass NULL to not check _UID * @hrv: Hardware Revision of the device, pass -1 to not check _HRV * * The caller is responsible for invoking acpi_dev_put() on the returned device. */ #define for_each_acpi_dev_match(adev, hid, uid, hrv) \ for (adev = acpi_dev_get_first_match_dev(hid, uid, hrv); \ adev; \ adev = acpi_dev_get_next_match_dev(adev, hid, uid, hrv)) static inline struct acpi_device *acpi_dev_get(struct acpi_device *adev) { return adev ? to_acpi_device(get_device(&adev->dev)) : NULL; } static inline void acpi_dev_put(struct acpi_device *adev) { if (adev) put_device(&adev->dev); } struct acpi_device *acpi_fetch_acpi_dev(acpi_handle handle); struct acpi_device *acpi_get_acpi_dev(acpi_handle handle); static inline void acpi_put_acpi_dev(struct acpi_device *adev) { acpi_dev_put(adev); } #else /* CONFIG_ACPI */ static inline int register_acpi_bus_type(void *bus) { return 0; } static inline int unregister_acpi_bus_type(void *bus) { return 0; } #endif /* CONFIG_ACPI */ #endif /*__ACPI_BUS_H__*/ |
2325 3 6105 2419 101 14 4101 4102 4102 4102 2750 2750 269 362 578 53 5913 5909 5916 5908 4658 4666 4631 57 1676 1066 494 499 225 271 7 1664 5855 4604 4609 4615 1673 901 1323 489 485 492 63 3 1 2 47 46 1 46 46 47 2 46 9181 89 2466 9175 3 5411 5937 2664 5490 2797 2799 2800 2798 9 9 9 43 41 2418 2410 41 41 2413 3 2412 2414 2551 2386 2438 2353 119 214 9 178 178 178 167 14 2478 2476 2477 2476 237 237 237 16 16 16 16 2272 2270 2272 2271 2272 2270 255 2616 256 1 2458 2462 2461 2462 2463 515 261 265 514 5 515 515 515 22 501 10 279 28 278 19 239 514 11 310 206 518 519 519 519 7 7 519 11291 10931 4030 522 518 9 9 522 11863 7190 11280 1 15715 1 15716 85 91 2 89 90 26 91 89 91 90 22 85 91 1 1 2769 2773 2771 2670 223 222 223 148 36 111 224 157 68 224 224 224 224 157 68 224 224 224 7 216 44 180 9 156 155 224 1 3 4 75 1 2 1 66 10 4 31 41 4 36 2 2 2 2 151 459 1 22 1 9 79 87 87 79 1 16 87 100 100 271 26 68 252 252 252 252 34 220 218 1 20 209 219 219 53 1 52 1 52 273 205 32 256 1 130 239 239 14 14 5 2 4 103 103 16 101 14 2271 2270 2274 1 3 2272 90 2260 3 3 2257 19 2271 2274 14 14 14 2420 2 2419 2418 2 1 47 1 100 2257 1 42 29 1 13 6 18 78 1 77 34 44 71 57 5 6 50 1 12 40 7 15 30 29 30 30 59 3 55 9 36 10 24 40 145 7 3 2 76 82 2412 2144 5 304 304 4 7 7 3 6 7 22 112 7 30 76 9 9 26 25 26 2 1 1 1 4 1 3 31 1 1 31 3 1 26 26 26 1 22 19 3 11 3 19 7 1 6 2321 212 2320 1 2317 1 2 2343 2225 298 4033 16 1 3830 207 4021 1815 2291 171 1468 2345 41 2 41 491 3868 3 2 2062 4188 448 3849 2 159 1 3754 458 4152 60 11 124 57 42 7 4031 151 4185 181 85 96 181 362 362 362 957 767 192 14 206 198 203 200 202 200 206 206 198 74 200 74 200 4251 217 4103 2013 2384 4240 3874 4240 131 1 1 2 5 122 4 1 3 2 119 1 120 120 120 119 1 124 60 1 3 25 4 9 26 1 22 15 11 11 54 1 41 3 1 1 17 2 3 1 12 11 11 11 11 11 11 10 16 14 26 26 14 13 10 2 2 23 14 23 12 12 6 18 24 2 9 13 10 4 28 26 24 9 18 5 38 3 4 1 1 1 3 5 2 1 10 3 36 48 38 87 2 1 15 1 61 53 8 61 6 34 38 572 571 221 180 156 50 41 176 175 32 32 32 2464 2461 113 49 93 10 1 9 9 28 1 1 1 24 25 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 | // SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/namespace.c * * (C) Copyright Al Viro 2000, 2001 * * Based on code from fs/super.c, copyright Linus Torvalds and others. * Heavily rewritten. */ #include <linux/syscalls.h> #include <linux/export.h> #include <linux/capability.h> #include <linux/mnt_namespace.h> #include <linux/user_namespace.h> #include <linux/namei.h> #include <linux/security.h> #include <linux/cred.h> #include <linux/idr.h> #include <linux/init.h> /* init_rootfs */ #include <linux/fs_struct.h> /* get_fs_root et.al. */ #include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */ #include <linux/file.h> #include <linux/uaccess.h> #include <linux/proc_ns.h> #include <linux/magic.h> #include <linux/memblock.h> #include <linux/proc_fs.h> #include <linux/task_work.h> #include <linux/sched/task.h> #include <uapi/linux/mount.h> #include <linux/fs_context.h> #include <linux/shmem_fs.h> #include <linux/mnt_idmapping.h> #include "pnode.h" #include "internal.h" /* Maximum number of mounts in a mount namespace */ static unsigned int sysctl_mount_max __read_mostly = 100000; static unsigned int m_hash_mask __read_mostly; static unsigned int m_hash_shift __read_mostly; static unsigned int mp_hash_mask __read_mostly; static unsigned int mp_hash_shift __read_mostly; static __initdata unsigned long mhash_entries; static int __init set_mhash_entries(char *str) { if (!str) return 0; mhash_entries = simple_strtoul(str, &str, 0); return 1; } __setup("mhash_entries=", set_mhash_entries); static __initdata unsigned long mphash_entries; static int __init set_mphash_entries(char *str) { if (!str) return 0; mphash_entries = simple_strtoul(str, &str, 0); return 1; } __setup("mphash_entries=", set_mphash_entries); static u64 event; static DEFINE_IDA(mnt_id_ida); static DEFINE_IDA(mnt_group_ida); static struct hlist_head *mount_hashtable __read_mostly; static struct hlist_head *mountpoint_hashtable __read_mostly; static struct kmem_cache *mnt_cache __read_mostly; static DECLARE_RWSEM(namespace_sem); static HLIST_HEAD(unmounted); /* protected by namespace_sem */ static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */ struct mount_kattr { unsigned int attr_set; unsigned int attr_clr; unsigned int propagation; unsigned int lookup_flags; bool recurse; struct user_namespace *mnt_userns; }; /* /sys/fs */ struct kobject *fs_kobj; EXPORT_SYMBOL_GPL(fs_kobj); /* * vfsmount lock may be taken for read to prevent changes to the * vfsmount hash, ie. during mountpoint lookups or walking back * up the tree. * * It should be taken for write in all cases where the vfsmount * tree or hash is modified or when a vfsmount structure is modified. */ __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock); static inline void lock_mount_hash(void) { write_seqlock(&mount_lock); } static inline void unlock_mount_hash(void) { write_sequnlock(&mount_lock); } static inline struct hlist_head *m_hash(struct vfsmount *mnt, struct dentry *dentry) { unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES); tmp += ((unsigned long)dentry / L1_CACHE_BYTES); tmp = tmp + (tmp >> m_hash_shift); return &mount_hashtable[tmp & m_hash_mask]; } static inline struct hlist_head *mp_hash(struct dentry *dentry) { unsigned long tmp = ((unsigned long)dentry / L1_CACHE_BYTES); tmp = tmp + (tmp >> mp_hash_shift); return &mountpoint_hashtable[tmp & mp_hash_mask]; } static int mnt_alloc_id(struct mount *mnt) { int res = ida_alloc(&mnt_id_ida, GFP_KERNEL); if (res < 0) return res; mnt->mnt_id = res; return 0; } static void mnt_free_id(struct mount *mnt) { ida_free(&mnt_id_ida, mnt->mnt_id); } /* * Allocate a new peer group ID */ static int mnt_alloc_group_id(struct mount *mnt) { int res = ida_alloc_min(&mnt_group_ida, 1, GFP_KERNEL); if (res < 0) return res; mnt->mnt_group_id = res; return 0; } /* * Release a peer group ID */ void mnt_release_group_id(struct mount *mnt) { ida_free(&mnt_group_ida, mnt->mnt_group_id); mnt->mnt_group_id = 0; } /* * vfsmount lock must be held for read */ static inline void mnt_add_count(struct mount *mnt, int n) { #ifdef CONFIG_SMP this_cpu_add(mnt->mnt_pcp->mnt_count, n); #else preempt_disable(); mnt->mnt_count += n; preempt_enable(); #endif } /* * vfsmount lock must be held for write */ int mnt_get_count(struct mount *mnt) { #ifdef CONFIG_SMP int count = 0; int cpu; for_each_possible_cpu(cpu) { count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count; } return count; #else return mnt->mnt_count; #endif } static struct mount *alloc_vfsmnt(const char *name) { struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL); if (mnt) { int err; err = mnt_alloc_id(mnt); if (err) goto out_free_cache; if (name) { mnt->mnt_devname = kstrdup_const(name, GFP_KERNEL_ACCOUNT); if (!mnt->mnt_devname) goto out_free_id; } #ifdef CONFIG_SMP mnt->mnt_pcp = alloc_percpu(struct mnt_pcp); if (!mnt->mnt_pcp) goto out_free_devname; this_cpu_add(mnt->mnt_pcp->mnt_count, 1); #else mnt->mnt_count = 1; mnt->mnt_writers = 0; #endif INIT_HLIST_NODE(&mnt->mnt_hash); INIT_LIST_HEAD(&mnt->mnt_child); INIT_LIST_HEAD(&mnt->mnt_mounts); INIT_LIST_HEAD(&mnt->mnt_list); INIT_LIST_HEAD(&mnt->mnt_expire); INIT_LIST_HEAD(&mnt->mnt_share); INIT_LIST_HEAD(&mnt->mnt_slave_list); INIT_LIST_HEAD(&mnt->mnt_slave); INIT_HLIST_NODE(&mnt->mnt_mp_list); INIT_LIST_HEAD(&mnt->mnt_umounting); INIT_HLIST_HEAD(&mnt->mnt_stuck_children); mnt->mnt.mnt_userns = &init_user_ns; } return mnt; #ifdef CONFIG_SMP out_free_devname: kfree_const(mnt->mnt_devname); #endif out_free_id: mnt_free_id(mnt); out_free_cache: kmem_cache_free(mnt_cache, mnt); return NULL; } /* * Most r/o checks on a fs are for operations that take * discrete amounts of time, like a write() or unlink(). * We must keep track of when those operations start * (for permission checks) and when they end, so that * we can determine when writes are able to occur to * a filesystem. */ /* * __mnt_is_readonly: check whether a mount is read-only * @mnt: the mount to check for its write status * * This shouldn't be used directly ouside of the VFS. * It does not guarantee that the filesystem will stay * r/w, just that it is right *now*. This can not and * should not be used in place of IS_RDONLY(inode). * mnt_want/drop_write() will _keep_ the filesystem * r/w. */ bool __mnt_is_readonly(struct vfsmount *mnt) { return (mnt->mnt_flags & MNT_READONLY) || sb_rdonly(mnt->mnt_sb); } EXPORT_SYMBOL_GPL(__mnt_is_readonly); static inline void mnt_inc_writers(struct mount *mnt) { #ifdef CONFIG_SMP this_cpu_inc(mnt->mnt_pcp->mnt_writers); #else mnt->mnt_writers++; #endif } static inline void mnt_dec_writers(struct mount *mnt) { #ifdef CONFIG_SMP this_cpu_dec(mnt->mnt_pcp->mnt_writers); #else mnt->mnt_writers--; #endif } static unsigned int mnt_get_writers(struct mount *mnt) { #ifdef CONFIG_SMP unsigned int count = 0; int cpu; for_each_possible_cpu(cpu) { count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers; } return count; #else return mnt->mnt_writers; #endif } static int mnt_is_readonly(struct vfsmount *mnt) { if (mnt->mnt_sb->s_readonly_remount) return 1; /* Order wrt setting s_flags/s_readonly_remount in do_remount() */ smp_rmb(); return __mnt_is_readonly(mnt); } /* * Most r/o & frozen checks on a fs are for operations that take discrete * amounts of time, like a write() or unlink(). We must keep track of when * those operations start (for permission checks) and when they end, so that we * can determine when writes are able to occur to a filesystem. */ /** * __mnt_want_write - get write access to a mount without freeze protection * @m: the mount on which to take a write * * This tells the low-level filesystem that a write is about to be performed to * it, and makes sure that writes are allowed (mnt it read-write) before * returning success. This operation does not protect against filesystem being * frozen. When the write operation is finished, __mnt_drop_write() must be * called. This is effectively a refcount. */ int __mnt_want_write(struct vfsmount *m) { struct mount *mnt = real_mount(m); int ret = 0; preempt_disable(); mnt_inc_writers(mnt); /* * The store to mnt_inc_writers must be visible before we pass * MNT_WRITE_HOLD loop below, so that the slowpath can see our * incremented count after it has set MNT_WRITE_HOLD. */ smp_mb(); might_lock(&mount_lock.lock); while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) { if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { cpu_relax(); } else { /* * This prevents priority inversion, if the task * setting MNT_WRITE_HOLD got preempted on a remote * CPU, and it prevents life lock if the task setting * MNT_WRITE_HOLD has a lower priority and is bound to * the same CPU as the task that is spinning here. */ preempt_enable(); lock_mount_hash(); unlock_mount_hash(); preempt_disable(); } } /* * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will * be set to match its requirements. So we must not load that until * MNT_WRITE_HOLD is cleared. */ smp_rmb(); if (mnt_is_readonly(m)) { mnt_dec_writers(mnt); ret = -EROFS; } preempt_enable(); return ret; } /** * mnt_want_write - get write access to a mount * @m: the mount on which to take a write * * This tells the low-level filesystem that a write is about to be performed to * it, and makes sure that writes are allowed (mount is read-write, filesystem * is not frozen) before returning success. When the write operation is * finished, mnt_drop_write() must be called. This is effectively a refcount. */ int mnt_want_write(struct vfsmount *m) { int ret; sb_start_write(m->mnt_sb); ret = __mnt_want_write(m); if (ret) sb_end_write(m->mnt_sb); return ret; } EXPORT_SYMBOL_GPL(mnt_want_write); /** * __mnt_want_write_file - get write access to a file's mount * @file: the file who's mount on which to take a write * * This is like __mnt_want_write, but if the file is already open for writing it * skips incrementing mnt_writers (since the open file already has a reference) * and instead only does the check for emergency r/o remounts. This must be * paired with __mnt_drop_write_file. */ int __mnt_want_write_file(struct file *file) { if (file->f_mode & FMODE_WRITER) { /* * Superblock may have become readonly while there are still * writable fd's, e.g. due to a fs error with errors=remount-ro */ if (__mnt_is_readonly(file->f_path.mnt)) return -EROFS; return 0; } return __mnt_want_write(file->f_path.mnt); } /** * mnt_want_write_file - get write access to a file's mount * @file: the file who's mount on which to take a write * * This is like mnt_want_write, but if the file is already open for writing it * skips incrementing mnt_writers (since the open file already has a reference) * and instead only does the freeze protection and the check for emergency r/o * remounts. This must be paired with mnt_drop_write_file. */ int mnt_want_write_file(struct file *file) { int ret; sb_start_write(file_inode(file)->i_sb); ret = __mnt_want_write_file(file); if (ret) sb_end_write(file_inode(file)->i_sb); return ret; } EXPORT_SYMBOL_GPL(mnt_want_write_file); /** * __mnt_drop_write - give up write access to a mount * @mnt: the mount on which to give up write access * * Tells the low-level filesystem that we are done * performing writes to it. Must be matched with * __mnt_want_write() call above. */ void __mnt_drop_write(struct vfsmount *mnt) { preempt_disable(); mnt_dec_writers(real_mount(mnt)); preempt_enable(); } /** * mnt_drop_write - give up write access to a mount * @mnt: the mount on which to give up write access * * Tells the low-level filesystem that we are done performing writes to it and * also allows filesystem to be frozen again. Must be matched with * mnt_want_write() call above. */ void mnt_drop_write(struct vfsmount *mnt) { __mnt_drop_write(mnt); sb_end_write(mnt->mnt_sb); } EXPORT_SYMBOL_GPL(mnt_drop_write); void __mnt_drop_write_file(struct file *file) { if (!(file->f_mode & FMODE_WRITER)) __mnt_drop_write(file->f_path.mnt); } void mnt_drop_write_file(struct file *file) { __mnt_drop_write_file(file); sb_end_write(file_inode(file)->i_sb); } EXPORT_SYMBOL(mnt_drop_write_file); /** * mnt_hold_writers - prevent write access to the given mount * @mnt: mnt to prevent write access to * * Prevents write access to @mnt if there are no active writers for @mnt. * This function needs to be called and return successfully before changing * properties of @mnt that need to remain stable for callers with write access * to @mnt. * * After this functions has been called successfully callers must pair it with * a call to mnt_unhold_writers() in order to stop preventing write access to * @mnt. * * Context: This function expects lock_mount_hash() to be held serializing * setting MNT_WRITE_HOLD. * Return: On success 0 is returned. * On error, -EBUSY is returned. */ static inline int mnt_hold_writers(struct mount *mnt) { mnt->mnt.mnt_flags |= MNT_WRITE_HOLD; /* * After storing MNT_WRITE_HOLD, we'll read the counters. This store * should be visible before we do. */ smp_mb(); /* * With writers on hold, if this value is zero, then there are * definitely no active writers (although held writers may subsequently * increment the count, they'll have to wait, and decrement it after * seeing MNT_READONLY). * * It is OK to have counter incremented on one CPU and decremented on * another: the sum will add up correctly. The danger would be when we * sum up each counter, if we read a counter before it is incremented, * but then read another CPU's count which it has been subsequently * decremented from -- we would see more decrements than we should. * MNT_WRITE_HOLD protects against this scenario, because * mnt_want_write first increments count, then smp_mb, then spins on * MNT_WRITE_HOLD, so it can't be decremented by another CPU while * we're counting up here. */ if (mnt_get_writers(mnt) > 0) return -EBUSY; return 0; } /** * mnt_unhold_writers - stop preventing write access to the given mount * @mnt: mnt to stop preventing write access to * * Stop preventing write access to @mnt allowing callers to gain write access * to @mnt again. * * This function can only be called after a successful call to * mnt_hold_writers(). * * Context: This function expects lock_mount_hash() to be held. */ static inline void mnt_unhold_writers(struct mount *mnt) { /* * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers * that become unheld will see MNT_READONLY. */ smp_wmb(); mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD; } static int mnt_make_readonly(struct mount *mnt) { int ret; ret = mnt_hold_writers(mnt); if (!ret) mnt->mnt.mnt_flags |= MNT_READONLY; mnt_unhold_writers(mnt); return ret; } int sb_prepare_remount_readonly(struct super_block *sb) { struct mount *mnt; int err = 0; /* Racy optimization. Recheck the counter under MNT_WRITE_HOLD */ if (atomic_long_read(&sb->s_remove_count)) return -EBUSY; lock_mount_hash(); list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) { if (!(mnt->mnt.mnt_flags & MNT_READONLY)) { err = mnt_hold_writers(mnt); if (err) break; } } if (!err && atomic_long_read(&sb->s_remove_count)) err = -EBUSY; if (!err) { sb->s_readonly_remount = 1; smp_wmb(); } list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) { if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD) mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD; } unlock_mount_hash(); return err; } static void free_vfsmnt(struct mount *mnt) { struct user_namespace *mnt_userns; mnt_userns = mnt_user_ns(&mnt->mnt); if (!initial_idmapping(mnt_userns)) put_user_ns(mnt_userns); kfree_const(mnt->mnt_devname); #ifdef CONFIG_SMP free_percpu(mnt->mnt_pcp); #endif kmem_cache_free(mnt_cache, mnt); } static void delayed_free_vfsmnt(struct rcu_head *head) { free_vfsmnt(container_of(head, struct mount, mnt_rcu)); } /* call under rcu_read_lock */ int __legitimize_mnt(struct vfsmount *bastard, unsigned seq) { struct mount *mnt; if (read_seqretry(&mount_lock, seq)) return 1; if (bastard == NULL) return 0; mnt = real_mount(bastard); mnt_add_count(mnt, 1); smp_mb(); // see mntput_no_expire() if (likely(!read_seqretry(&mount_lock, seq))) return 0; if (bastard->mnt_flags & MNT_SYNC_UMOUNT) { mnt_add_count(mnt, -1); return 1; } lock_mount_hash(); if (unlikely(bastard->mnt_flags & MNT_DOOMED)) { mnt_add_count(mnt, -1); unlock_mount_hash(); return 1; } unlock_mount_hash(); /* caller will mntput() */ return -1; } /* call under rcu_read_lock */ static bool legitimize_mnt(struct vfsmount *bastard, unsigned seq) { int res = __legitimize_mnt(bastard, seq); if (likely(!res)) return true; if (unlikely(res < 0)) { rcu_read_unlock(); mntput(bastard); rcu_read_lock(); } return false; } /* * find the first mount at @dentry on vfsmount @mnt. * call under rcu_read_lock() */ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry) { struct hlist_head *head = m_hash(mnt, dentry); struct mount *p; hlist_for_each_entry_rcu(p, head, mnt_hash) if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry) return p; return NULL; } /* * lookup_mnt - Return the first child mount mounted at path * * "First" means first mounted chronologically. If you create the * following mounts: * * mount /dev/sda1 /mnt * mount /dev/sda2 /mnt * mount /dev/sda3 /mnt * * Then lookup_mnt() on the base /mnt dentry in the root mount will * return successively the root dentry and vfsmount of /dev/sda1, then * /dev/sda2, then /dev/sda3, then NULL. * * lookup_mnt takes a reference to the found vfsmount. */ struct vfsmount *lookup_mnt(const struct path *path) { struct mount *child_mnt; struct vfsmount *m; unsigned seq; rcu_read_lock(); do { seq = read_seqbegin(&mount_lock); child_mnt = __lookup_mnt(path->mnt, path->dentry); m = child_mnt ? &child_mnt->mnt : NULL; } while (!legitimize_mnt(m, seq)); rcu_read_unlock(); return m; } static inline void lock_ns_list(struct mnt_namespace *ns) { spin_lock(&ns->ns_lock); } static inline void unlock_ns_list(struct mnt_namespace *ns) { spin_unlock(&ns->ns_lock); } static inline bool mnt_is_cursor(struct mount *mnt) { return mnt->mnt.mnt_flags & MNT_CURSOR; } /* * __is_local_mountpoint - Test to see if dentry is a mountpoint in the * current mount namespace. * * The common case is dentries are not mountpoints at all and that * test is handled inline. For the slow case when we are actually * dealing with a mountpoint of some kind, walk through all of the * mounts in the current mount namespace and test to see if the dentry * is a mountpoint. * * The mount_hashtable is not usable in the context because we * need to identify all mounts that may be in the current mount * namespace not just a mount that happens to have some specified * parent mount. */ bool __is_local_mountpoint(struct dentry *dentry) { struct mnt_namespace *ns = current->nsproxy->mnt_ns; struct mount *mnt; bool is_covered = false; down_read(&namespace_sem); lock_ns_list(ns); list_for_each_entry(mnt, &ns->list, mnt_list) { if (mnt_is_cursor(mnt)) continue; is_covered = (mnt->mnt_mountpoint == dentry); if (is_covered) break; } unlock_ns_list(ns); up_read(&namespace_sem); return is_covered; } static struct mountpoint *lookup_mountpoint(struct dentry *dentry) { struct hlist_head *chain = mp_hash(dentry); struct mountpoint *mp; hlist_for_each_entry(mp, chain, m_hash) { if (mp->m_dentry == dentry) { mp->m_count++; return mp; } } return NULL; } static struct mountpoint *get_mountpoint(struct dentry *dentry) { struct mountpoint *mp, *new = NULL; int ret; if (d_mountpoint(dentry)) { /* might be worth a WARN_ON() */ if (d_unlinked(dentry)) return ERR_PTR(-ENOENT); mountpoint: read_seqlock_excl(&mount_lock); mp = lookup_mountpoint(dentry); read_sequnlock_excl(&mount_lock); if (mp) goto done; } if (!new) new = kmalloc(sizeof(struct mountpoint), GFP_KERNEL); if (!new) return ERR_PTR(-ENOMEM); /* Exactly one processes may set d_mounted */ ret = d_set_mounted(dentry); /* Someone else set d_mounted? */ if (ret == -EBUSY) goto mountpoint; /* The dentry is not available as a mountpoint? */ mp = ERR_PTR(ret); if (ret) goto done; /* Add the new mountpoint to the hash table */ read_seqlock_excl(&mount_lock); new->m_dentry = dget(dentry); new->m_count = 1; hlist_add_head(&new->m_hash, mp_hash(dentry)); INIT_HLIST_HEAD(&new->m_list); read_sequnlock_excl(&mount_lock); mp = new; new = NULL; done: kfree(new); return mp; } /* * vfsmount lock must be held. Additionally, the caller is responsible * for serializing calls for given disposal list. */ static void __put_mountpoint(struct mountpoint *mp, struct list_head *list) { if (!--mp->m_count) { struct dentry *dentry = mp->m_dentry; BUG_ON(!hlist_empty(&mp->m_list)); spin_lock(&dentry->d_lock); dentry->d_flags &= ~DCACHE_MOUNTED; spin_unlock(&dentry->d_lock); dput_to_list(dentry, list); hlist_del(&mp->m_hash); kfree(mp); } } /* called with namespace_lock and vfsmount lock */ static void put_mountpoint(struct mountpoint *mp) { __put_mountpoint(mp, &ex_mountpoints); } static inline int check_mnt(struct mount *mnt) { return mnt->mnt_ns == current->nsproxy->mnt_ns; } /* * vfsmount lock must be held for write */ static void touch_mnt_namespace(struct mnt_namespace *ns) { if (ns) { ns->event = ++event; wake_up_interruptible(&ns->poll); } } /* * vfsmount lock must be held for write */ static void __touch_mnt_namespace(struct mnt_namespace *ns) { if (ns && ns->event != event) { ns->event = event; wake_up_interruptible(&ns->poll); } } /* * vfsmount lock must be held for write */ static struct mountpoint *unhash_mnt(struct mount *mnt) { struct mountpoint *mp; mnt->mnt_parent = mnt; mnt->mnt_mountpoint = mnt->mnt.mnt_root; list_del_init(&mnt->mnt_child); hlist_del_init_rcu(&mnt->mnt_hash); hlist_del_init(&mnt->mnt_mp_list); mp = mnt->mnt_mp; mnt->mnt_mp = NULL; return mp; } /* * vfsmount lock must be held for write */ static void umount_mnt(struct mount *mnt) { put_mountpoint(unhash_mnt(mnt)); } /* * vfsmount lock must be held for write */ void mnt_set_mountpoint(struct mount *mnt, struct mountpoint *mp, struct mount *child_mnt) { mp->m_count++; mnt_add_count(mnt, 1); /* essentially, that's mntget */ child_mnt->mnt_mountpoint = mp->m_dentry; child_mnt->mnt_parent = mnt; child_mnt->mnt_mp = mp; hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list); } static void __attach_mnt(struct mount *mnt, struct mount *parent) { hlist_add_head_rcu(&mnt->mnt_hash, m_hash(&parent->mnt, mnt->mnt_mountpoint)); list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); } /* * vfsmount lock must be held for write */ static void attach_mnt(struct mount *mnt, struct mount *parent, struct mountpoint *mp) { mnt_set_mountpoint(parent, mp, mnt); __attach_mnt(mnt, parent); } void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt) { struct mountpoint *old_mp = mnt->mnt_mp; struct mount *old_parent = mnt->mnt_parent; list_del_init(&mnt->mnt_child); hlist_del_init(&mnt->mnt_mp_list); hlist_del_init_rcu(&mnt->mnt_hash); attach_mnt(mnt, parent, mp); put_mountpoint(old_mp); mnt_add_count(old_parent, -1); } /* * vfsmount lock must be held for write */ static void commit_tree(struct mount *mnt) { struct mount *parent = mnt->mnt_parent; struct mount *m; LIST_HEAD(head); struct mnt_namespace *n = parent->mnt_ns; BUG_ON(parent == mnt); list_add_tail(&head, &mnt->mnt_list); list_for_each_entry(m, &head, mnt_list) m->mnt_ns = n; list_splice(&head, n->list.prev); n->mounts += n->pending_mounts; n->pending_mounts = 0; __attach_mnt(mnt, parent); touch_mnt_namespace(n); } static struct mount *next_mnt(struct mount *p, struct mount *root) { struct list_head *next = p->mnt_mounts.next; if (next == &p->mnt_mounts) { while (1) { if (p == root) return NULL; next = p->mnt_child.next; if (next != &p->mnt_parent->mnt_mounts) break; p = p->mnt_parent; } } return list_entry(next, struct mount, mnt_child); } static struct mount *skip_mnt_tree(struct mount *p) { struct list_head *prev = p->mnt_mounts.prev; while (prev != &p->mnt_mounts) { p = list_entry(prev, struct mount, mnt_child); prev = p->mnt_mounts.prev; } return p; } /** * vfs_create_mount - Create a mount for a configured superblock * @fc: The configuration context with the superblock attached * * Create a mount to an already configured superblock. If necessary, the * caller should invoke vfs_get_tree() before calling this. * * Note that this does not attach the mount to anything. */ struct vfsmount *vfs_create_mount(struct fs_context *fc) { struct mount *mnt; struct user_namespace *fs_userns; if (!fc->root) return ERR_PTR(-EINVAL); mnt = alloc_vfsmnt(fc->source ?: "none"); if (!mnt) return ERR_PTR(-ENOMEM); if (fc->sb_flags & SB_KERNMOUNT) mnt->mnt.mnt_flags = MNT_INTERNAL; atomic_inc(&fc->root->d_sb->s_active); mnt->mnt.mnt_sb = fc->root->d_sb; mnt->mnt.mnt_root = dget(fc->root); mnt->mnt_mountpoint = mnt->mnt.mnt_root; mnt->mnt_parent = mnt; fs_userns = mnt->mnt.mnt_sb->s_user_ns; if (!initial_idmapping(fs_userns)) mnt->mnt.mnt_userns = get_user_ns(fs_userns); lock_mount_hash(); list_add_tail(&mnt->mnt_instance, &mnt->mnt.mnt_sb->s_mounts); unlock_mount_hash(); return &mnt->mnt; } EXPORT_SYMBOL(vfs_create_mount); struct vfsmount *fc_mount(struct fs_context *fc) { int err = vfs_get_tree(fc); if (!err) { up_write(&fc->root->d_sb->s_umount); return vfs_create_mount(fc); } return ERR_PTR(err); } EXPORT_SYMBOL(fc_mount); struct vfsmount *vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data) { struct fs_context *fc; struct vfsmount *mnt; int ret = 0; if (!type) return ERR_PTR(-EINVAL); fc = fs_context_for_mount(type, flags); if (IS_ERR(fc)) return ERR_CAST(fc); if (name) ret = vfs_parse_fs_string(fc, "source", name, strlen(name)); if (!ret) ret = parse_monolithic_mount_data(fc, data); if (!ret) mnt = fc_mount(fc); else mnt = ERR_PTR(ret); put_fs_context(fc); return mnt; } EXPORT_SYMBOL_GPL(vfs_kern_mount); struct vfsmount * vfs_submount(const struct dentry *mountpoint, struct file_system_type *type, const char *name, void *data) { /* Until it is worked out how to pass the user namespace * through from the parent mount to the submount don't support * unprivileged mounts with submounts. */ if (mountpoint->d_sb->s_user_ns != &init_user_ns) return ERR_PTR(-EPERM); return vfs_kern_mount(type, SB_SUBMOUNT, name, data); } EXPORT_SYMBOL_GPL(vfs_submount); static struct mount *clone_mnt(struct mount *old, struct dentry *root, int flag) { struct super_block *sb = old->mnt.mnt_sb; struct mount *mnt; int err; mnt = alloc_vfsmnt(old->mnt_devname); if (!mnt) return ERR_PTR(-ENOMEM); if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE)) mnt->mnt_group_id = 0; /* not a peer of original */ else mnt->mnt_group_id = old->mnt_group_id; if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) { err = mnt_alloc_group_id(mnt); if (err) goto out_free; } mnt->mnt.mnt_flags = old->mnt.mnt_flags; mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL); atomic_inc(&sb->s_active); mnt->mnt.mnt_userns = mnt_user_ns(&old->mnt); if (!initial_idmapping(mnt->mnt.mnt_userns)) mnt->mnt.mnt_userns = get_user_ns(mnt->mnt.mnt_userns); mnt->mnt.mnt_sb = sb; mnt->mnt.mnt_root = dget(root); mnt->mnt_mountpoint = mnt->mnt.mnt_root; mnt->mnt_parent = mnt; lock_mount_hash(); list_add_tail(&mnt->mnt_instance, &sb->s_mounts); unlock_mount_hash(); if ((flag & CL_SLAVE) || ((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) { list_add(&mnt->mnt_slave, &old->mnt_slave_list); mnt->mnt_master = old; CLEAR_MNT_SHARED(mnt); } else if (!(flag & CL_PRIVATE)) { if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old)) list_add(&mnt->mnt_share, &old->mnt_share); if (IS_MNT_SLAVE(old)) list_add(&mnt->mnt_slave, &old->mnt_slave); mnt->mnt_master = old->mnt_master; } else { CLEAR_MNT_SHARED(mnt); } if (flag & CL_MAKE_SHARED) set_mnt_shared(mnt); /* stick the duplicate mount on the same expiry list * as the original if that was on one */ if (flag & CL_EXPIRE) { if (!list_empty(&old->mnt_expire)) list_add(&mnt->mnt_expire, &old->mnt_expire); } return mnt; out_free: mnt_free_id(mnt); free_vfsmnt(mnt); return ERR_PTR(err); } static void cleanup_mnt(struct mount *mnt) { struct hlist_node *p; struct mount *m; /* * The warning here probably indicates that somebody messed * up a mnt_want/drop_write() pair. If this happens, the * filesystem was probably unable to make r/w->r/o transitions. * The locking used to deal with mnt_count decrement provides barriers, * so mnt_get_writers() below is safe. */ WARN_ON(mnt_get_writers(mnt)); if (unlikely(mnt->mnt_pins.first)) mnt_pin_kill(mnt); hlist_for_each_entry_safe(m, p, &mnt->mnt_stuck_children, mnt_umount) { hlist_del(&m->mnt_umount); mntput(&m->mnt); } fsnotify_vfsmount_delete(&mnt->mnt); dput(mnt->mnt.mnt_root); deactivate_super(mnt->mnt.mnt_sb); mnt_free_id(mnt); call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt); } static void __cleanup_mnt(struct rcu_head *head) { cleanup_mnt(container_of(head, struct mount, mnt_rcu)); } static LLIST_HEAD(delayed_mntput_list); static void delayed_mntput(struct work_struct *unused) { struct llist_node *node = llist_del_all(&delayed_mntput_list); struct mount *m, *t; llist_for_each_entry_safe(m, t, node, mnt_llist) cleanup_mnt(m); } static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput); static void mntput_no_expire(struct mount *mnt) { LIST_HEAD(list); int count; rcu_read_lock(); if (likely(READ_ONCE(mnt->mnt_ns))) { /* * Since we don't do lock_mount_hash() here, * ->mnt_ns can change under us. However, if it's * non-NULL, then there's a reference that won't * be dropped until after an RCU delay done after * turning ->mnt_ns NULL. So if we observe it * non-NULL under rcu_read_lock(), the reference * we are dropping is not the final one. */ mnt_add_count(mnt, -1); rcu_read_unlock(); return; } lock_mount_hash(); /* * make sure that if __legitimize_mnt() has not seen us grab * mount_lock, we'll see their refcount increment here. */ smp_mb(); mnt_add_count(mnt, -1); count = mnt_get_count(mnt); if (count != 0) { WARN_ON(count < 0); rcu_read_unlock(); unlock_mount_hash(); return; } if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) { rcu_read_unlock(); unlock_mount_hash(); return; } mnt->mnt.mnt_flags |= MNT_DOOMED; rcu_read_unlock(); list_del(&mnt->mnt_instance); if (unlikely(!list_empty(&mnt->mnt_mounts))) { struct mount *p, *tmp; list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts, mnt_child) { __put_mountpoint(unhash_mnt(p), &list); hlist_add_head(&p->mnt_umount, &mnt->mnt_stuck_children); } } unlock_mount_hash(); shrink_dentry_list(&list); if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) { struct task_struct *task = current; if (likely(!(task->flags & PF_KTHREAD))) { init_task_work(&mnt->mnt_rcu, __cleanup_mnt); if (!task_work_add(task, &mnt->mnt_rcu, TWA_RESUME)) return; } if (llist_add(&mnt->mnt_llist, &delayed_mntput_list)) schedule_delayed_work(&delayed_mntput_work, 1); return; } cleanup_mnt(mnt); } void mntput(struct vfsmount *mnt) { if (mnt) { struct mount *m = real_mount(mnt); /* avoid cacheline pingpong, hope gcc doesn't get "smart" */ if (unlikely(m->mnt_expiry_mark)) m->mnt_expiry_mark = 0; mntput_no_expire(m); } } EXPORT_SYMBOL(mntput); struct vfsmount *mntget(struct vfsmount *mnt) { if (mnt) mnt_add_count(real_mount(mnt), 1); return mnt; } EXPORT_SYMBOL(mntget); /** * path_is_mountpoint() - Check if path is a mount in the current namespace. * @path: path to check * * d_mountpoint() can only be used reliably to establish if a dentry is * not mounted in any namespace and that common case is handled inline. * d_mountpoint() isn't aware of the possibility there may be multiple * mounts using a given dentry in a different namespace. This function * checks if the passed in path is a mountpoint rather than the dentry * alone. */ bool path_is_mountpoint(const struct path *path) { unsigned seq; bool res; if (!d_mountpoint(path->dentry)) return false; rcu_read_lock(); do { seq = read_seqbegin(&mount_lock); res = __path_is_mountpoint(path); } while (read_seqretry(&mount_lock, seq)); rcu_read_unlock(); return res; } EXPORT_SYMBOL(path_is_mountpoint); struct vfsmount *mnt_clone_internal(const struct path *path) { struct mount *p; p = clone_mnt(real_mount(path->mnt), path->dentry, CL_PRIVATE); if (IS_ERR(p)) return ERR_CAST(p); p->mnt.mnt_flags |= MNT_INTERNAL; return &p->mnt; } #ifdef CONFIG_PROC_FS static struct mount *mnt_list_next(struct mnt_namespace *ns, struct list_head *p) { struct mount *mnt, *ret = NULL; lock_ns_list(ns); list_for_each_continue(p, &ns->list) { mnt = list_entry(p, typeof(*mnt), mnt_list); if (!mnt_is_cursor(mnt)) { ret = mnt; break; } } unlock_ns_list(ns); return ret; } /* iterator; we want it to have access to namespace_sem, thus here... */ static void *m_start(struct seq_file *m, loff_t *pos) { struct proc_mounts *p = m->private; struct list_head *prev; down_read(&namespace_sem); if (!*pos) { prev = &p->ns->list; } else { prev = &p->cursor.mnt_list; /* Read after we'd reached the end? */ if (list_empty(prev)) return NULL; } return mnt_list_next(p->ns, prev); } static void *m_next(struct seq_file *m, void *v, loff_t *pos) { struct proc_mounts *p = m->private; struct mount *mnt = v; ++*pos; return mnt_list_next(p->ns, &mnt->mnt_list); } static void m_stop(struct seq_file *m, void *v) { struct proc_mounts *p = m->private; struct mount *mnt = v; lock_ns_list(p->ns); if (mnt) list_move_tail(&p->cursor.mnt_list, &mnt->mnt_list); else list_del_init(&p->cursor.mnt_list); unlock_ns_list(p->ns); up_read(&namespace_sem); } static int m_show(struct seq_file *m, void *v) { struct proc_mounts *p = m->private; struct mount *r = v; return p->show(m, &r->mnt); } const struct seq_operations mounts_op = { .start = m_start, .next = m_next, .stop = m_stop, .show = m_show, }; void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor) { down_read(&namespace_sem); lock_ns_list(ns); list_del(&cursor->mnt_list); unlock_ns_list(ns); up_read(&namespace_sem); } #endif /* CONFIG_PROC_FS */ /** * may_umount_tree - check if a mount tree is busy * @m: root of mount tree * * This is called to check if a tree of mounts has any * open files, pwds, chroots or sub mounts that are * busy. */ int may_umount_tree(struct vfsmount *m) { struct mount *mnt = real_mount(m); int actual_refs = 0; int minimum_refs = 0; struct mount *p; BUG_ON(!m); /* write lock needed for mnt_get_count */ lock_mount_hash(); for (p = mnt; p; p = next_mnt(p, mnt)) { actual_refs += mnt_get_count(p); minimum_refs += 2; } unlock_mount_hash(); if (actual_refs > minimum_refs) return 0; return 1; } EXPORT_SYMBOL(may_umount_tree); /** * may_umount - check if a mount point is busy * @mnt: root of mount * * This is called to check if a mount point has any * open files, pwds, chroots or sub mounts. If the * mount has sub mounts this will return busy * regardless of whether the sub mounts are busy. * * Doesn't take quota and stuff into account. IOW, in some cases it will * give false negatives. The main reason why it's here is that we need * a non-destructive way to look for easily umountable filesystems. */ int may_umount(struct vfsmount *mnt) { int ret = 1; down_read(&namespace_sem); lock_mount_hash(); if (propagate_mount_busy(real_mount(mnt), 2)) ret = 0; unlock_mount_hash(); up_read(&namespace_sem); return ret; } EXPORT_SYMBOL(may_umount); static void namespace_unlock(void) { struct hlist_head head; struct hlist_node *p; struct mount *m; LIST_HEAD(list); hlist_move_list(&unmounted, &head); list_splice_init(&ex_mountpoints, &list); up_write(&namespace_sem); shrink_dentry_list(&list); if (likely(hlist_empty(&head))) return; synchronize_rcu_expedited(); hlist_for_each_entry_safe(m, p, &head, mnt_umount) { hlist_del(&m->mnt_umount); mntput(&m->mnt); } } static inline void namespace_lock(void) { down_write(&namespace_sem); } enum umount_tree_flags { UMOUNT_SYNC = 1, UMOUNT_PROPAGATE = 2, UMOUNT_CONNECTED = 4, }; static bool disconnect_mount(struct mount *mnt, enum umount_tree_flags how) { /* Leaving mounts connected is only valid for lazy umounts */ if (how & UMOUNT_SYNC) return true; /* A mount without a parent has nothing to be connected to */ if (!mnt_has_parent(mnt)) return true; /* Because the reference counting rules change when mounts are * unmounted and connected, umounted mounts may not be * connected to mounted mounts. */ if (!(mnt->mnt_parent->mnt.mnt_flags & MNT_UMOUNT)) return true; /* Has it been requested that the mount remain connected? */ if (how & UMOUNT_CONNECTED) return false; /* Is the mount locked such that it needs to remain connected? */ if (IS_MNT_LOCKED(mnt)) return false; /* By default disconnect the mount */ return true; } /* * mount_lock must be held * namespace_sem must be held for write */ static void umount_tree(struct mount *mnt, enum umount_tree_flags how) { LIST_HEAD(tmp_list); struct mount *p; if (how & UMOUNT_PROPAGATE) propagate_mount_unlock(mnt); /* Gather the mounts to umount */ for (p = mnt; p; p = next_mnt(p, mnt)) { p->mnt.mnt_flags |= MNT_UMOUNT; list_move(&p->mnt_list, &tmp_list); } /* Hide the mounts from mnt_mounts */ list_for_each_entry(p, &tmp_list, mnt_list) { list_del_init(&p->mnt_child); } /* Add propogated mounts to the tmp_list */ if (how & UMOUNT_PROPAGATE) propagate_umount(&tmp_list); while (!list_empty(&tmp_list)) { struct mnt_namespace *ns; bool disconnect; p = list_first_entry(&tmp_list, struct mount, mnt_list); list_del_init(&p->mnt_expire); list_del_init(&p->mnt_list); ns = p->mnt_ns; if (ns) { ns->mounts--; __touch_mnt_namespace(ns); } p->mnt_ns = NULL; if (how & UMOUNT_SYNC) p->mnt.mnt_flags |= MNT_SYNC_UMOUNT; disconnect = disconnect_mount(p, how); if (mnt_has_parent(p)) { mnt_add_count(p->mnt_parent, -1); if (!disconnect) { /* Don't forget about p */ list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts); } else { umount_mnt(p); } } change_mnt_propagation(p, MS_PRIVATE); if (disconnect) hlist_add_head(&p->mnt_umount, &unmounted); } } static void shrink_submounts(struct mount *mnt); static int do_umount_root(struct super_block *sb) { int ret = 0; down_write(&sb->s_umount); if (!sb_rdonly(sb)) { struct fs_context *fc; fc = fs_context_for_reconfigure(sb->s_root, SB_RDONLY, SB_RDONLY); if (IS_ERR(fc)) { ret = PTR_ERR(fc); } else { ret = parse_monolithic_mount_data(fc, NULL); if (!ret) ret = reconfigure_super(fc); put_fs_context(fc); } } up_write(&sb->s_umount); return ret; } static int do_umount(struct mount *mnt, int flags) { struct super_block *sb = mnt->mnt.mnt_sb; int retval; retval = security_sb_umount(&mnt->mnt, flags); if (retval) return retval; /* * Allow userspace to request a mountpoint be expired rather than * unmounting unconditionally. Unmount only happens if: * (1) the mark is already set (the mark is cleared by mntput()) * (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount] */ if (flags & MNT_EXPIRE) { if (&mnt->mnt == current->fs->root.mnt || flags & (MNT_FORCE | MNT_DETACH)) return -EINVAL; /* * probably don't strictly need the lock here if we examined * all race cases, but it's a slowpath. */ lock_mount_hash(); if (mnt_get_count(mnt) != 2) { unlock_mount_hash(); return -EBUSY; } unlock_mount_hash(); if (!xchg(&mnt->mnt_expiry_mark, 1)) return -EAGAIN; } /* * If we may have to abort operations to get out of this * mount, and they will themselves hold resources we must * allow the fs to do things. In the Unix tradition of * 'Gee thats tricky lets do it in userspace' the umount_begin * might fail to complete on the first run through as other tasks * must return, and the like. Thats for the mount program to worry * about for the moment. */ if (flags & MNT_FORCE && sb->s_op->umount_begin) { sb->s_op->umount_begin(sb); } /* * No sense to grab the lock for this test, but test itself looks * somewhat bogus. Suggestions for better replacement? * Ho-hum... In principle, we might treat that as umount + switch * to rootfs. GC would eventually take care of the old vfsmount. * Actually it makes sense, especially if rootfs would contain a * /reboot - static binary that would close all descriptors and * call reboot(9). Then init(8) could umount root and exec /reboot. */ if (&mnt->mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) { /* * Special case for "unmounting" root ... * we just try to remount it readonly. */ if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) return -EPERM; return do_umount_root(sb); } namespace_lock(); lock_mount_hash(); /* Recheck MNT_LOCKED with the locks held */ retval = -EINVAL; if (mnt->mnt.mnt_flags & MNT_LOCKED) goto out; event++; if (flags & MNT_DETACH) { if (!list_empty(&mnt->mnt_list)) umount_tree(mnt, UMOUNT_PROPAGATE); retval = 0; } else { shrink_submounts(mnt); retval = -EBUSY; if (!propagate_mount_busy(mnt, 2)) { if (!list_empty(&mnt->mnt_list)) umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC); retval = 0; } } out: unlock_mount_hash(); namespace_unlock(); return retval; } /* * __detach_mounts - lazily unmount all mounts on the specified dentry * * During unlink, rmdir, and d_drop it is possible to loose the path * to an existing mountpoint, and wind up leaking the mount. * detach_mounts allows lazily unmounting those mounts instead of * leaking them. * * The caller may hold dentry->d_inode->i_mutex. */ void __detach_mounts(struct dentry *dentry) { struct mountpoint *mp; struct mount *mnt; namespace_lock(); lock_mount_hash(); mp = lookup_mountpoint(dentry); if (!mp) goto out_unlock; event++; while (!hlist_empty(&mp->m_list)) { mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list); if (mnt->mnt.mnt_flags & MNT_UMOUNT) { umount_mnt(mnt); hlist_add_head(&mnt->mnt_umount, &unmounted); } else umount_tree(mnt, UMOUNT_CONNECTED); } put_mountpoint(mp); out_unlock: unlock_mount_hash(); namespace_unlock(); } /* * Is the caller allowed to modify his namespace? */ bool may_mount(void) { return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN); } static void warn_mandlock(void) { pr_warn_once("=======================================================\n" "WARNING: The mand mount option has been deprecated and\n" " and is ignored by this kernel. Remove the mand\n" " option from the mount to silence this warning.\n" "=======================================================\n"); } static int can_umount(const struct path *path, int flags) { struct mount *mnt = real_mount(path->mnt); if (!may_mount()) return -EPERM; if (path->dentry != path->mnt->mnt_root) return -EINVAL; if (!check_mnt(mnt)) return -EINVAL; if (mnt->mnt.mnt_flags & MNT_LOCKED) /* Check optimistically */ return -EINVAL; if (flags & MNT_FORCE && !capable(CAP_SYS_ADMIN)) return -EPERM; return 0; } // caller is responsible for flags being sane int path_umount(struct path *path, int flags) { struct mount *mnt = real_mount(path->mnt); int ret; ret = can_umount(path, flags); if (!ret) ret = do_umount(mnt, flags); /* we mustn't call path_put() as that would clear mnt_expiry_mark */ dput(path->dentry); mntput_no_expire(mnt); return ret; } static int ksys_umount(char __user *name, int flags) { int lookup_flags = LOOKUP_MOUNTPOINT; struct path path; int ret; // basic validity checks done first if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW)) return -EINVAL; if (!(flags & UMOUNT_NOFOLLOW)) lookup_flags |= LOOKUP_FOLLOW; ret = user_path_at(AT_FDCWD, name, lookup_flags, &path); if (ret) return ret; return path_umount(&path, flags); } SYSCALL_DEFINE2(umount, char __user *, name, int, flags) { return ksys_umount(name, flags); } #ifdef __ARCH_WANT_SYS_OLDUMOUNT /* * The 2.0 compatible umount. No flags. */ SYSCALL_DEFINE1(oldumount, char __user *, name) { return ksys_umount(name, 0); } #endif static bool is_mnt_ns_file(struct dentry *dentry) { /* Is this a proxy for a mount namespace? */ return dentry->d_op == &ns_dentry_operations && dentry->d_fsdata == &mntns_operations; } static struct mnt_namespace *to_mnt_ns(struct ns_common *ns) { return container_of(ns, struct mnt_namespace, ns); } struct ns_common *from_mnt_ns(struct mnt_namespace *mnt) { return &mnt->ns; } static bool mnt_ns_loop(struct dentry *dentry) { /* Could bind mounting the mount namespace inode cause a * mount namespace loop? */ struct mnt_namespace *mnt_ns; if (!is_mnt_ns_file(dentry)) return false; mnt_ns = to_mnt_ns(get_proc_ns(dentry->d_inode)); return current->nsproxy->mnt_ns->seq >= mnt_ns->seq; } struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, int flag) { struct mount *res, *p, *q, *r, *parent; if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(mnt)) return ERR_PTR(-EINVAL); if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(dentry)) return ERR_PTR(-EINVAL); res = q = clone_mnt(mnt, dentry, flag); if (IS_ERR(q)) return q; q->mnt_mountpoint = mnt->mnt_mountpoint; p = mnt; list_for_each_entry(r, &mnt->mnt_mounts, mnt_child) { struct mount *s; if (!is_subdir(r->mnt_mountpoint, dentry)) continue; for (s = r; s; s = next_mnt(s, r)) { if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(s)) { if (s->mnt.mnt_flags & MNT_LOCKED) { /* Both unbindable and locked. */ q = ERR_PTR(-EPERM); goto out; } else { s = skip_mnt_tree(s); continue; } } if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(s->mnt.mnt_root)) { s = skip_mnt_tree(s); continue; } while (p != s->mnt_parent) { p = p->mnt_parent; q = q->mnt_parent; } p = s; parent = q; q = clone_mnt(p, p->mnt.mnt_root, flag); if (IS_ERR(q)) goto out; lock_mount_hash(); list_add_tail(&q->mnt_list, &res->mnt_list); attach_mnt(q, parent, p->mnt_mp); unlock_mount_hash(); } } return res; out: if (res) { lock_mount_hash(); umount_tree(res, UMOUNT_SYNC); unlock_mount_hash(); } return q; } /* Caller should check returned pointer for errors */ struct vfsmount *collect_mounts(const struct path *path) { struct mount *tree; namespace_lock(); if (!check_mnt(real_mount(path->mnt))) tree = ERR_PTR(-EINVAL); else tree = copy_tree(real_mount(path->mnt), path->dentry, CL_COPY_ALL | CL_PRIVATE); namespace_unlock(); if (IS_ERR(tree)) return ERR_CAST(tree); return &tree->mnt; } static void free_mnt_ns(struct mnt_namespace *); static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *, bool); void dissolve_on_fput(struct vfsmount *mnt) { struct mnt_namespace *ns; namespace_lock(); lock_mount_hash(); ns = real_mount(mnt)->mnt_ns; if (ns) { if (is_anon_ns(ns)) umount_tree(real_mount(mnt), UMOUNT_CONNECTED); else ns = NULL; } unlock_mount_hash(); namespace_unlock(); if (ns) free_mnt_ns(ns); } void drop_collected_mounts(struct vfsmount *mnt) { namespace_lock(); lock_mount_hash(); umount_tree(real_mount(mnt), 0); unlock_mount_hash(); namespace_unlock(); } static bool has_locked_children(struct mount *mnt, struct dentry *dentry) { struct mount *child; list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { if (!is_subdir(child->mnt_mountpoint, dentry)) continue; if (child->mnt.mnt_flags & MNT_LOCKED) return true; } return false; } /** * clone_private_mount - create a private clone of a path * @path: path to clone * * This creates a new vfsmount, which will be the clone of @path. The new mount * will not be attached anywhere in the namespace and will be private (i.e. * changes to the originating mount won't be propagated into this). * * Release with mntput(). */ struct vfsmount *clone_private_mount(const struct path *path) { struct mount *old_mnt = real_mount(path->mnt); struct mount *new_mnt; down_read(&namespace_sem); if (IS_MNT_UNBINDABLE(old_mnt)) goto invalid; if (!check_mnt(old_mnt)) goto invalid; if (has_locked_children(old_mnt, path->dentry)) goto invalid; new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE); up_read(&namespace_sem); if (IS_ERR(new_mnt)) return ERR_CAST(new_mnt); /* Longterm mount to be removed by kern_unmount*() */ new_mnt->mnt_ns = MNT_NS_INTERNAL; return &new_mnt->mnt; invalid: up_read(&namespace_sem); return ERR_PTR(-EINVAL); } EXPORT_SYMBOL_GPL(clone_private_mount); int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg, struct vfsmount *root) { struct mount *mnt; int res = f(root, arg); if (res) return res; list_for_each_entry(mnt, &real_mount(root)->mnt_list, mnt_list) { res = f(&mnt->mnt, arg); if (res) return res; } return 0; } static void lock_mnt_tree(struct mount *mnt) { struct mount *p; for (p = mnt; p; p = next_mnt(p, mnt)) { int flags = p->mnt.mnt_flags; /* Don't allow unprivileged users to change mount flags */ flags |= MNT_LOCK_ATIME; if (flags & MNT_READONLY) flags |= MNT_LOCK_READONLY; if (flags & MNT_NODEV) flags |= MNT_LOCK_NODEV; if (flags & MNT_NOSUID) flags |= MNT_LOCK_NOSUID; if (flags & MNT_NOEXEC) flags |= MNT_LOCK_NOEXEC; /* Don't allow unprivileged users to reveal what is under a mount */ if (list_empty(&p->mnt_expire)) flags |= MNT_LOCKED; p->mnt.mnt_flags = flags; } } static void cleanup_group_ids(struct mount *mnt, struct mount *end) { struct mount *p; for (p = mnt; p != end; p = next_mnt(p, mnt)) { if (p->mnt_group_id && !IS_MNT_SHARED(p)) mnt_release_group_id(p); } } static int invent_group_ids(struct mount *mnt, bool recurse) { struct mount *p; for (p = mnt; p; p = recurse ? next_mnt(p, mnt) : NULL) { if (!p->mnt_group_id && !IS_MNT_SHARED(p)) { int err = mnt_alloc_group_id(p); if (err) { cleanup_group_ids(mnt, p); return err; } } } return 0; } int count_mounts(struct mnt_namespace *ns, struct mount *mnt) { unsigned int max = READ_ONCE(sysctl_mount_max); unsigned int mounts = 0; struct mount *p; if (ns->mounts >= max) return -ENOSPC; max -= ns->mounts; if (ns->pending_mounts >= max) return -ENOSPC; max -= ns->pending_mounts; for (p = mnt; p; p = next_mnt(p, mnt)) mounts++; if (mounts > max) return -ENOSPC; ns->pending_mounts += mounts; return 0; } /* * @source_mnt : mount tree to be attached * @nd : place the mount tree @source_mnt is attached * @parent_nd : if non-null, detach the source_mnt from its parent and * store the parent mount and mountpoint dentry. * (done when source_mnt is moved) * * NOTE: in the table below explains the semantics when a source mount * of a given type is attached to a destination mount of a given type. * --------------------------------------------------------------------------- * | BIND MOUNT OPERATION | * |************************************************************************** * | source-->| shared | private | slave | unbindable | * | dest | | | | | * | | | | | | | * | v | | | | | * |************************************************************************** * | shared | shared (++) | shared (+) | shared(+++)| invalid | * | | | | | | * |non-shared| shared (+) | private | slave (*) | invalid | * *************************************************************************** * A bind operation clones the source mount and mounts the clone on the * destination mount. * * (++) the cloned mount is propagated to all the mounts in the propagation * tree of the destination mount and the cloned mount is added to * the peer group of the source mount. * (+) the cloned mount is created under the destination mount and is marked * as shared. The cloned mount is added to the peer group of the source * mount. * (+++) the mount is propagated to all the mounts in the propagation tree * of the destination mount and the cloned mount is made slave * of the same master as that of the source mount. The cloned mount * is marked as 'shared and slave'. * (*) the cloned mount is made a slave of the same master as that of the * source mount. * * --------------------------------------------------------------------------- * | MOVE MOUNT OPERATION | * |************************************************************************** * | source-->| shared | private | slave | unbindable | * | dest | | | | | * | | | | | | | * | v | | | | | * |************************************************************************** * | shared | shared (+) | shared (+) | shared(+++) | invalid | * | | | | | | * |non-shared| shared (+*) | private | slave (*) | unbindable | * *************************************************************************** * * (+) the mount is moved to the destination. And is then propagated to * all the mounts in the propagation tree of the destination mount. * (+*) the mount is moved to the destination. * (+++) the mount is moved to the destination and is then propagated to * all the mounts belonging to the destination mount's propagation tree. * the mount is marked as 'shared and slave'. * (*) the mount continues to be a slave at the new location. * * if the source mount is a tree, the operations explained above is * applied to each mount in the tree. * Must be called without spinlocks held, since this function can sleep * in allocations. */ static int attach_recursive_mnt(struct mount *source_mnt, struct mount *dest_mnt, struct mountpoint *dest_mp, bool moving) { struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; HLIST_HEAD(tree_list); struct mnt_namespace *ns = dest_mnt->mnt_ns; struct mountpoint *smp; struct mount *child, *p; struct hlist_node *n; int err; /* Preallocate a mountpoint in case the new mounts need * to be tucked under other mounts. */ smp = get_mountpoint(source_mnt->mnt.mnt_root); if (IS_ERR(smp)) return PTR_ERR(smp); /* Is there space to add these mounts to the mount namespace? */ if (!moving) { err = count_mounts(ns, source_mnt); if (err) goto out; } if (IS_MNT_SHARED(dest_mnt)) { err = invent_group_ids(source_mnt, true); if (err) goto out; err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list); lock_mount_hash(); if (err) goto out_cleanup_ids; for (p = source_mnt; p; p = next_mnt(p, source_mnt)) set_mnt_shared(p); } else { lock_mount_hash(); } if (moving) { unhash_mnt(source_mnt); attach_mnt(source_mnt, dest_mnt, dest_mp); touch_mnt_namespace(source_mnt->mnt_ns); } else { if (source_mnt->mnt_ns) { /* move from anon - the caller will destroy */ list_del_init(&source_mnt->mnt_ns->list); } mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt); commit_tree(source_mnt); } hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) { struct mount *q; hlist_del_init(&child->mnt_hash); q = __lookup_mnt(&child->mnt_parent->mnt, child->mnt_mountpoint); if (q) mnt_change_mountpoint(child, smp, q); /* Notice when we are propagating across user namespaces */ if (child->mnt_parent->mnt_ns->user_ns != user_ns) lock_mnt_tree(child); child->mnt.mnt_flags &= ~MNT_LOCKED; commit_tree(child); } put_mountpoint(smp); unlock_mount_hash(); return 0; out_cleanup_ids: while (!hlist_empty(&tree_list)) { child = hlist_entry(tree_list.first, struct mount, mnt_hash); child->mnt_parent->mnt_ns->pending_mounts = 0; umount_tree(child, UMOUNT_SYNC); } unlock_mount_hash(); cleanup_group_ids(source_mnt, NULL); out: ns->pending_mounts = 0; read_seqlock_excl(&mount_lock); put_mountpoint(smp); read_sequnlock_excl(&mount_lock); return err; } static struct mountpoint *lock_mount(struct path *path) { struct vfsmount *mnt; struct dentry *dentry = path->dentry; retry: inode_lock(dentry->d_inode); if (unlikely(cant_mount(dentry))) { inode_unlock(dentry->d_inode); return ERR_PTR(-ENOENT); } namespace_lock(); mnt = lookup_mnt(path); if (likely(!mnt)) { struct mountpoint *mp = get_mountpoint(dentry); if (IS_ERR(mp)) { namespace_unlock(); inode_unlock(dentry->d_inode); return mp; } return mp; } namespace_unlock(); inode_unlock(path->dentry->d_inode); path_put(path); path->mnt = mnt; dentry = path->dentry = dget(mnt->mnt_root); goto retry; } static void unlock_mount(struct mountpoint *where) { struct dentry *dentry = where->m_dentry; read_seqlock_excl(&mount_lock); put_mountpoint(where); read_sequnlock_excl(&mount_lock); namespace_unlock(); inode_unlock(dentry->d_inode); } static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp) { if (mnt->mnt.mnt_sb->s_flags & SB_NOUSER) return -EINVAL; if (d_is_dir(mp->m_dentry) != d_is_dir(mnt->mnt.mnt_root)) return -ENOTDIR; return attach_recursive_mnt(mnt, p, mp, false); } /* * Sanity check the flags to change_mnt_propagation. */ static int flags_to_propagation_type(int ms_flags) { int type = ms_flags & ~(MS_REC | MS_SILENT); /* Fail if any non-propagation flags are set */ if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) return 0; /* Only one propagation flag should be set */ if (!is_power_of_2(type)) return 0; return type; } /* * recursively change the type of the mountpoint. */ static int do_change_type(struct path *path, int ms_flags) { struct mount *m; struct mount *mnt = real_mount(path->mnt); int recurse = ms_flags & MS_REC; int type; int err = 0; if (path->dentry != path->mnt->mnt_root) return -EINVAL; type = flags_to_propagation_type(ms_flags); if (!type) return -EINVAL; namespace_lock(); if (type == MS_SHARED) { err = invent_group_ids(mnt, recurse); if (err) goto out_unlock; } lock_mount_hash(); for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL)) change_mnt_propagation(m, type); unlock_mount_hash(); out_unlock: namespace_unlock(); return err; } static struct mount *__do_loopback(struct path *old_path, int recurse) { struct mount *mnt = ERR_PTR(-EINVAL), *old = real_mount(old_path->mnt); if (IS_MNT_UNBINDABLE(old)) return mnt; if (!check_mnt(old) && old_path->dentry->d_op != &ns_dentry_operations) return mnt; if (!recurse && has_locked_children(old, old_path->dentry)) return mnt; if (recurse) mnt = copy_tree(old, old_path->dentry, CL_COPY_MNT_NS_FILE); else mnt = clone_mnt(old, old_path->dentry, 0); if (!IS_ERR(mnt)) mnt->mnt.mnt_flags &= ~MNT_LOCKED; return mnt; } /* * do loopback mount. */ static int do_loopback(struct path *path, const char *old_name, int recurse) { struct path old_path; struct mount *mnt = NULL, *parent; struct mountpoint *mp; int err; if (!old_name || !*old_name) return -EINVAL; err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path); if (err) return err; err = -EINVAL; if (mnt_ns_loop(old_path.dentry)) goto out; mp = lock_mount(path); if (IS_ERR(mp)) { err = PTR_ERR(mp); goto out; } parent = real_mount(path->mnt); if (!check_mnt(parent)) goto out2; mnt = __do_loopback(&old_path, recurse); if (IS_ERR(mnt)) { err = PTR_ERR(mnt); goto out2; } err = graft_tree(mnt, parent, mp); if (err) { lock_mount_hash(); umount_tree(mnt, UMOUNT_SYNC); unlock_mount_hash(); } out2: unlock_mount(mp); out: path_put(&old_path); return err; } static struct file *open_detached_copy(struct path *path, bool recursive) { struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; struct mnt_namespace *ns = alloc_mnt_ns(user_ns, true); struct mount *mnt, *p; struct file *file; if (IS_ERR(ns)) return ERR_CAST(ns); namespace_lock(); mnt = __do_loopback(path, recursive); if (IS_ERR(mnt)) { namespace_unlock(); free_mnt_ns(ns); return ERR_CAST(mnt); } lock_mount_hash(); for (p = mnt; p; p = next_mnt(p, mnt)) { p->mnt_ns = ns; ns->mounts++; } ns->root = mnt; list_add_tail(&ns->list, &mnt->mnt_list); mntget(&mnt->mnt); unlock_mount_hash(); namespace_unlock(); mntput(path->mnt); path->mnt = &mnt->mnt; file = dentry_open(path, O_PATH, current_cred()); if (IS_ERR(file)) dissolve_on_fput(path->mnt); else file->f_mode |= FMODE_NEED_UNMOUNT; return file; } SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, flags) { struct file *file; struct path path; int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW; bool detached = flags & OPEN_TREE_CLONE; int error; int fd; BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC); if (flags & ~(AT_EMPTY_PATH | AT_NO_AUTOMOUNT | AT_RECURSIVE | AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC)) return -EINVAL; if ((flags & (AT_RECURSIVE | OPEN_TREE_CLONE)) == AT_RECURSIVE) return -EINVAL; if (flags & AT_NO_AUTOMOUNT) lookup_flags &= ~LOOKUP_AUTOMOUNT; if (flags & AT_SYMLINK_NOFOLLOW) lookup_flags &= ~LOOKUP_FOLLOW; if (flags & AT_EMPTY_PATH) lookup_flags |= LOOKUP_EMPTY; if (detached && !may_mount()) return -EPERM; fd = get_unused_fd_flags(flags & O_CLOEXEC); if (fd < 0) return fd; error = user_path_at(dfd, filename, lookup_flags, &path); if (unlikely(error)) { file = ERR_PTR(error); } else { if (detached) file = open_detached_copy(&path, flags & AT_RECURSIVE); else file = dentry_open(&path, O_PATH, current_cred()); path_put(&path); } if (IS_ERR(file)) { put_unused_fd(fd); return PTR_ERR(file); } fd_install(fd, file); return fd; } /* * Don't allow locked mount flags to be cleared. * * No locks need to be held here while testing the various MNT_LOCK * flags because those flags can never be cleared once they are set. */ static bool can_change_locked_flags(struct mount *mnt, unsigned int mnt_flags) { unsigned int fl = mnt->mnt.mnt_flags; if ((fl & MNT_LOCK_READONLY) && !(mnt_flags & MNT_READONLY)) return false; if ((fl & MNT_LOCK_NODEV) && !(mnt_flags & MNT_NODEV)) return false; if ((fl & MNT_LOCK_NOSUID) && !(mnt_flags & MNT_NOSUID)) return false; if ((fl & MNT_LOCK_NOEXEC) && !(mnt_flags & MNT_NOEXEC)) return false; if ((fl & MNT_LOCK_ATIME) && ((fl & MNT_ATIME_MASK) != (mnt_flags & MNT_ATIME_MASK))) return false; return true; } static int change_mount_ro_state(struct mount *mnt, unsigned int mnt_flags) { bool readonly_request = (mnt_flags & MNT_READONLY); if (readonly_request == __mnt_is_readonly(&mnt->mnt)) return 0; if (readonly_request) return mnt_make_readonly(mnt); mnt->mnt.mnt_flags &= ~MNT_READONLY; return 0; } static void set_mount_attributes(struct mount *mnt, unsigned int mnt_flags) { mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK; mnt->mnt.mnt_flags = mnt_flags; touch_mnt_namespace(mnt->mnt_ns); } static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount *mnt) { struct super_block *sb = mnt->mnt_sb; if (!__mnt_is_readonly(mnt) && (!(sb->s_iflags & SB_I_TS_EXPIRY_WARNED)) && (ktime_get_real_seconds() + TIME_UPTIME_SEC_MAX > sb->s_time_max)) { char *buf, *mntpath; buf = (char *)__get_free_page(GFP_KERNEL); if (buf) mntpath = d_path(mountpoint, buf, PAGE_SIZE); else mntpath = ERR_PTR(-ENOMEM); if (IS_ERR(mntpath)) mntpath = "(unknown)"; pr_warn("%s filesystem being %s at %s supports timestamps until %ptTd (0x%llx)\n", sb->s_type->name, is_mounted(mnt) ? "remounted" : "mounted", mntpath, &sb->s_time_max, (unsigned long long)sb->s_time_max); sb->s_iflags |= SB_I_TS_EXPIRY_WARNED; if (buf) free_page((unsigned long)buf); } } /* * Handle reconfiguration of the mountpoint only without alteration of the * superblock it refers to. This is triggered by specifying MS_REMOUNT|MS_BIND * to mount(2). */ static int do_reconfigure_mnt(struct path *path, unsigned int mnt_flags) { struct super_block *sb = path->mnt->mnt_sb; struct mount *mnt = real_mount(path->mnt); int ret; if (!check_mnt(mnt)) return -EINVAL; if (path->dentry != mnt->mnt.mnt_root) return -EINVAL; if (!can_change_locked_flags(mnt, mnt_flags)) return -EPERM; /* * We're only checking whether the superblock is read-only not * changing it, so only take down_read(&sb->s_umount). */ down_read(&sb->s_umount); lock_mount_hash(); ret = change_mount_ro_state(mnt, mnt_flags); if (ret == 0) set_mount_attributes(mnt, mnt_flags); unlock_mount_hash(); up_read(&sb->s_umount); mnt_warn_timestamp_expiry(path, &mnt->mnt); return ret; } /* * change filesystem flags. dir should be a physical root of filesystem. * If you've mounted a non-root directory somewhere and want to do remount * on it - tough luck. */ static int do_remount(struct path *path, int ms_flags, int sb_flags, int mnt_flags, void *data) { int err; struct super_block *sb = path->mnt->mnt_sb; struct mount *mnt = real_mount(path->mnt); struct fs_context *fc; if (!check_mnt(mnt)) return -EINVAL; if (path->dentry != path->mnt->mnt_root) return -EINVAL; if (!can_change_locked_flags(mnt, mnt_flags)) return -EPERM; fc = fs_context_for_reconfigure(path->dentry, sb_flags, MS_RMT_MASK); if (IS_ERR(fc)) return PTR_ERR(fc); /* * Indicate to the filesystem that the remount request is coming * from the legacy mount system call. */ fc->oldapi = true; err = parse_monolithic_mount_data(fc, data); if (!err) { down_write(&sb->s_umount); err = -EPERM; if (ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) { err = reconfigure_super(fc); if (!err) { lock_mount_hash(); set_mount_attributes(mnt, mnt_flags); unlock_mount_hash(); } } up_write(&sb->s_umount); } mnt_warn_timestamp_expiry(path, &mnt->mnt); put_fs_context(fc); return err; } static inline int tree_contains_unbindable(struct mount *mnt) { struct mount *p; for (p = mnt; p; p = next_mnt(p, mnt)) { if (IS_MNT_UNBINDABLE(p)) return 1; } return 0; } /* * Check that there aren't references to earlier/same mount namespaces in the * specified subtree. Such references can act as pins for mount namespaces * that aren't checked by the mount-cycle checking code, thereby allowing * cycles to be made. */ static bool check_for_nsfs_mounts(struct mount *subtree) { struct mount *p; bool ret = false; lock_mount_hash(); for (p = subtree; p; p = next_mnt(p, subtree)) if (mnt_ns_loop(p->mnt.mnt_root)) goto out; ret = true; out: unlock_mount_hash(); return ret; } static int do_set_group(struct path *from_path, struct path *to_path) { struct mount *from, *to; int err; from = real_mount(from_path->mnt); to = real_mount(to_path->mnt); namespace_lock(); err = -EINVAL; /* To and From must be mounted */ if (!is_mounted(&from->mnt)) goto out; if (!is_mounted(&to->mnt)) goto out; err = -EPERM; /* We should be allowed to modify mount namespaces of both mounts */ if (!ns_capable(from->mnt_ns->user_ns, CAP_SYS_ADMIN)) goto out; if (!ns_capable(to->mnt_ns->user_ns, CAP_SYS_ADMIN)) goto out; err = -EINVAL; /* To and From paths should be mount roots */ if (from_path->dentry != from_path->mnt->mnt_root) goto out; if (to_path->dentry != to_path->mnt->mnt_root) goto out; /* Setting sharing groups is only allowed across same superblock */ if (from->mnt.mnt_sb != to->mnt.mnt_sb) goto out; /* From mount root should be wider than To mount root */ if (!is_subdir(to->mnt.mnt_root, from->mnt.mnt_root)) goto out; /* From mount should not have locked children in place of To's root */ if (has_locked_children(from, to->mnt.mnt_root)) goto out; /* Setting sharing groups is only allowed on private mounts */ if (IS_MNT_SHARED(to) || IS_MNT_SLAVE(to)) goto out; /* From should not be private */ if (!IS_MNT_SHARED(from) && !IS_MNT_SLAVE(from)) goto out; if (IS_MNT_SLAVE(from)) { struct mount *m = from->mnt_master; list_add(&to->mnt_slave, &m->mnt_slave_list); to->mnt_master = m; } if (IS_MNT_SHARED(from)) { to->mnt_group_id = from->mnt_group_id; list_add(&to->mnt_share, &from->mnt_share); lock_mount_hash(); set_mnt_shared(to); unlock_mount_hash(); } err = 0; out: namespace_unlock(); return err; } static int do_move_mount(struct path *old_path, struct path *new_path) { struct mnt_namespace *ns; struct mount *p; struct mount *old; struct mount *parent; struct mountpoint *mp, *old_mp; int err; bool attached; mp = lock_mount(new_path); if (IS_ERR(mp)) return PTR_ERR(mp); old = real_mount(old_path->mnt); p = real_mount(new_path->mnt); parent = old->mnt_parent; attached = mnt_has_parent(old); old_mp = old->mnt_mp; ns = old->mnt_ns; err = -EINVAL; /* The mountpoint must be in our namespace. */ if (!check_mnt(p)) goto out; /* The thing moved must be mounted... */ if (!is_mounted(&old->mnt)) goto out; /* ... and either ours or the root of anon namespace */ if (!(attached ? check_mnt(old) : is_anon_ns(ns))) goto out; if (old->mnt.mnt_flags & MNT_LOCKED) goto out; if (old_path->dentry != old_path->mnt->mnt_root) goto out; if (d_is_dir(new_path->dentry) != d_is_dir(old_path->dentry)) goto out; /* * Don't move a mount residing in a shared parent. */ if (attached && IS_MNT_SHARED(parent)) goto out; /* * Don't move a mount tree containing unbindable mounts to a destination * mount which is shared. */ if (IS_MNT_SHARED(p) && tree_contains_unbindable(old)) goto out; err = -ELOOP; if (!check_for_nsfs_mounts(old)) goto out; for (; mnt_has_parent(p); p = p->mnt_parent) if (p == old) goto out; err = attach_recursive_mnt(old, real_mount(new_path->mnt), mp, attached); if (err) goto out; /* if the mount is moved, it should no longer be expire * automatically */ list_del_init(&old->mnt_expire); if (attached) put_mountpoint(old_mp); out: unlock_mount(mp); if (!err) { if (attached) mntput_no_expire(parent); else free_mnt_ns(ns); } return err; } static int do_move_mount_old(struct path *path, const char *old_name) { struct path old_path; int err; if (!old_name || !*old_name) return -EINVAL; err = kern_path(old_name, LOOKUP_FOLLOW, &old_path); if (err) return err; err = do_move_mount(&old_path, path); path_put(&old_path); return err; } /* * add a mount into a namespace's mount tree */ static int do_add_mount(struct mount *newmnt, struct mountpoint *mp, const struct path *path, int mnt_flags) { struct mount *parent = real_mount(path->mnt); mnt_flags &= ~MNT_INTERNAL_FLAGS; if (unlikely(!check_mnt(parent))) { /* that's acceptable only for automounts done in private ns */ if (!(mnt_flags & MNT_SHRINKABLE)) return -EINVAL; /* ... and for those we'd better have mountpoint still alive */ if (!parent->mnt_ns) return -EINVAL; } /* Refuse the same filesystem on the same mount point */ if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb && path->mnt->mnt_root == path->dentry) return -EBUSY; if (d_is_symlink(newmnt->mnt.mnt_root)) return -EINVAL; newmnt->mnt.mnt_flags = mnt_flags; return graft_tree(newmnt, parent, mp); } static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags); /* * Create a new mount using a superblock configuration and request it * be added to the namespace tree. */ static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint, unsigned int mnt_flags) { struct vfsmount *mnt; struct mountpoint *mp; struct super_block *sb = fc->root->d_sb; int error; error = security_sb_kern_mount(sb); if (!error && mount_too_revealing(sb, &mnt_flags)) error = -EPERM; if (unlikely(error)) { fc_drop_locked(fc); return error; } up_write(&sb->s_umount); mnt = vfs_create_mount(fc); if (IS_ERR(mnt)) return PTR_ERR(mnt); mnt_warn_timestamp_expiry(mountpoint, mnt); mp = lock_mount(mountpoint); if (IS_ERR(mp)) { mntput(mnt); return PTR_ERR(mp); } error = do_add_mount(real_mount(mnt), mp, mountpoint, mnt_flags); unlock_mount(mp); if (error < 0) mntput(mnt); return error; } /* * create a new mount for userspace and request it to be added into the * namespace's tree */ static int do_new_mount(struct path *path, const char *fstype, int sb_flags, int mnt_flags, const char *name, void *data) { struct file_system_type *type; struct fs_context *fc; const char *subtype = NULL; int err = 0; if (!fstype) return -EINVAL; type = get_fs_type(fstype); if (!type) return -ENODEV; if (type->fs_flags & FS_HAS_SUBTYPE) { subtype = strchr(fstype, '.'); if (subtype) { subtype++; if (!*subtype) { put_filesystem(type); return -EINVAL; } } } fc = fs_context_for_mount(type, sb_flags); put_filesystem(type); if (IS_ERR(fc)) return PTR_ERR(fc); /* * Indicate to the filesystem that the mount request is coming * from the legacy mount system call. */ fc->oldapi = true; if (subtype) err = vfs_parse_fs_string(fc, "subtype", subtype, strlen(subtype)); if (!err && name) err = vfs_parse_fs_string(fc, "source", name, strlen(name)); if (!err) err = parse_monolithic_mount_data(fc, data); if (!err && !mount_capable(fc)) err = -EPERM; if (!err) err = vfs_get_tree(fc); if (!err) err = do_new_mount_fc(fc, path, mnt_flags); put_fs_context(fc); return err; } int finish_automount(struct vfsmount *m, const struct path *path) { struct dentry *dentry = path->dentry; struct mountpoint *mp; struct mount *mnt; int err; if (!m) return 0; if (IS_ERR(m)) return PTR_ERR(m); mnt = real_mount(m); /* The new mount record should have at least 2 refs to prevent it being * expired before we get a chance to add it */ BUG_ON(mnt_get_count(mnt) < 2); if (m->mnt_sb == path->mnt->mnt_sb && m->mnt_root == dentry) { err = -ELOOP; goto discard; } /* * we don't want to use lock_mount() - in this case finding something * that overmounts our mountpoint to be means "quitely drop what we've * got", not "try to mount it on top". */ inode_lock(dentry->d_inode); namespace_lock(); if (unlikely(cant_mount(dentry))) { err = -ENOENT; goto discard_locked; } rcu_read_lock(); if (unlikely(__lookup_mnt(path->mnt, dentry))) { rcu_read_unlock(); err = 0; goto discard_locked; } rcu_read_unlock(); mp = get_mountpoint(dentry); if (IS_ERR(mp)) { err = PTR_ERR(mp); goto discard_locked; } err = do_add_mount(mnt, mp, path, path->mnt->mnt_flags | MNT_SHRINKABLE); unlock_mount(mp); if (unlikely(err)) goto discard; mntput(m); return 0; discard_locked: namespace_unlock(); inode_unlock(dentry->d_inode); discard: /* remove m from any expiration list it may be on */ if (!list_empty(&mnt->mnt_expire)) { namespace_lock(); list_del_init(&mnt->mnt_expire); namespace_unlock(); } mntput(m); mntput(m); return err; } /** * mnt_set_expiry - Put a mount on an expiration list * @mnt: The mount to list. * @expiry_list: The list to add the mount to. */ void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list) { namespace_lock(); list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list); namespace_unlock(); } EXPORT_SYMBOL(mnt_set_expiry); /* * process a list of expirable mountpoints with the intent of discarding any * mountpoints that aren't in use and haven't been touched since last we came * here */ void mark_mounts_for_expiry(struct list_head *mounts) { struct mount *mnt, *next; LIST_HEAD(graveyard); if (list_empty(mounts)) return; namespace_lock(); lock_mount_hash(); /* extract from the expiration list every vfsmount that matches the * following criteria: * - only referenced by its parent vfsmount * - still marked for expiry (marked on the last call here; marks are * cleared by mntput()) */ list_for_each_entry_safe(mnt, next, mounts, mnt_expire) { if (!xchg(&mnt->mnt_expiry_mark, 1) || propagate_mount_busy(mnt, 1)) continue; list_move(&mnt->mnt_expire, &graveyard); } while (!list_empty(&graveyard)) { mnt = list_first_entry(&graveyard, struct mount, mnt_expire); touch_mnt_namespace(mnt->mnt_ns); umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC); } unlock_mount_hash(); namespace_unlock(); } EXPORT_SYMBOL_GPL(mark_mounts_for_expiry); /* * Ripoff of 'select_parent()' * * search the list of submounts for a given mountpoint, and move any * shrinkable submounts to the 'graveyard' list. */ static int select_submounts(struct mount *parent, struct list_head *graveyard) { struct mount *this_parent = parent; struct list_head *next; int found = 0; repeat: next = this_parent->mnt_mounts.next; resume: while (next != &this_parent->mnt_mounts) { struct list_head *tmp = next; struct mount *mnt = list_entry(tmp, struct mount, mnt_child); next = tmp->next; if (!(mnt->mnt.mnt_flags & MNT_SHRINKABLE)) continue; /* * Descend a level if the d_mounts list is non-empty. */ if (!list_empty(&mnt->mnt_mounts)) { this_parent = mnt; goto repeat; } if (!propagate_mount_busy(mnt, 1)) { list_move_tail(&mnt->mnt_expire, graveyard); found++; } } /* * All done at this level ... ascend and resume the search */ if (this_parent != parent) { next = this_parent->mnt_child.next; this_parent = this_parent->mnt_parent; goto resume; } return found; } /* * process a list of expirable mountpoints with the intent of discarding any * submounts of a specific parent mountpoint * * mount_lock must be held for write */ static void shrink_submounts(struct mount *mnt) { LIST_HEAD(graveyard); struct mount *m; /* extract submounts of 'mountpoint' from the expiration list */ while (select_submounts(mnt, &graveyard)) { while (!list_empty(&graveyard)) { m = list_first_entry(&graveyard, struct mount, mnt_expire); touch_mnt_namespace(m->mnt_ns); umount_tree(m, UMOUNT_PROPAGATE|UMOUNT_SYNC); } } } static void *copy_mount_options(const void __user * data) { char *copy; unsigned left, offset; if (!data) return NULL; copy = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!copy) return ERR_PTR(-ENOMEM); left = copy_from_user(copy, data, PAGE_SIZE); /* * Not all architectures have an exact copy_from_user(). Resort to * byte at a time. */ offset = PAGE_SIZE - left; while (left) { char c; if (get_user(c, (const char __user *)data + offset)) break; copy[offset] = c; left--; offset++; } if (left == PAGE_SIZE) { kfree(copy); return ERR_PTR(-EFAULT); } return copy; } static char *copy_mount_string(const void __user *data) { return data ? strndup_user(data, PATH_MAX) : NULL; } /* * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to * be given to the mount() call (ie: read-only, no-dev, no-suid etc). * * data is a (void *) that can point to any structure up to * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent * information (or be NULL). * * Pre-0.97 versions of mount() didn't have a flags word. * When the flags word was introduced its top half was required * to have the magic value 0xC0ED, and this remained so until 2.4.0-test9. * Therefore, if this magic number is present, it carries no information * and must be discarded. */ int path_mount(const char *dev_name, struct path *path, const char *type_page, unsigned long flags, void *data_page) { unsigned int mnt_flags = 0, sb_flags; int ret; /* Discard magic */ if ((flags & MS_MGC_MSK) == MS_MGC_VAL) flags &= ~MS_MGC_MSK; /* Basic sanity checks */ if (data_page) ((char *)data_page)[PAGE_SIZE - 1] = 0; if (flags & MS_NOUSER) return -EINVAL; ret = security_sb_mount(dev_name, path, type_page, flags, data_page); if (ret) return ret; if (!may_mount()) return -EPERM; if (flags & SB_MANDLOCK) warn_mandlock(); /* Default to relatime unless overriden */ if (!(flags & MS_NOATIME)) mnt_flags |= MNT_RELATIME; /* Separate the per-mountpoint flags */ if (flags & MS_NOSUID) mnt_flags |= MNT_NOSUID; if (flags & MS_NODEV) mnt_flags |= MNT_NODEV; if (flags & MS_NOEXEC) mnt_flags |= MNT_NOEXEC; if (flags & MS_NOATIME) mnt_flags |= MNT_NOATIME; if (flags & MS_NODIRATIME) mnt_flags |= MNT_NODIRATIME; if (flags & MS_STRICTATIME) mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME); if (flags & MS_RDONLY) mnt_flags |= MNT_READONLY; if (flags & MS_NOSYMFOLLOW) mnt_flags |= MNT_NOSYMFOLLOW; /* The default atime for remount is preservation */ if ((flags & MS_REMOUNT) && ((flags & (MS_NOATIME | MS_NODIRATIME | MS_RELATIME | MS_STRICTATIME)) == 0)) { mnt_flags &= ~MNT_ATIME_MASK; mnt_flags |= path->mnt->mnt_flags & MNT_ATIME_MASK; } sb_flags = flags & (SB_RDONLY | SB_SYNCHRONOUS | SB_MANDLOCK | SB_DIRSYNC | SB_SILENT | SB_POSIXACL | SB_LAZYTIME | SB_I_VERSION); if ((flags & (MS_REMOUNT | MS_BIND)) == (MS_REMOUNT | MS_BIND)) return do_reconfigure_mnt(path, mnt_flags); if (flags & MS_REMOUNT) return do_remount(path, flags, sb_flags, mnt_flags, data_page); if (flags & MS_BIND) return do_loopback(path, dev_name, flags & MS_REC); if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) return do_change_type(path, flags); if (flags & MS_MOVE) return do_move_mount_old(path, dev_name); return do_new_mount(path, type_page, sb_flags, mnt_flags, dev_name, data_page); } long do_mount(const char *dev_name, const char __user *dir_name, const char *type_page, unsigned long flags, void *data_page) { struct path path; int ret; ret = user_path_at(AT_FDCWD, dir_name, LOOKUP_FOLLOW, &path); if (ret) return ret; ret = path_mount(dev_name, &path, type_page, flags, data_page); path_put(&path); return ret; } static struct ucounts *inc_mnt_namespaces(struct user_namespace *ns) { return inc_ucount(ns, current_euid(), UCOUNT_MNT_NAMESPACES); } static void dec_mnt_namespaces(struct ucounts *ucounts) { dec_ucount(ucounts, UCOUNT_MNT_NAMESPACES); } static void free_mnt_ns(struct mnt_namespace *ns) { if (!is_anon_ns(ns)) ns_free_inum(&ns->ns); dec_mnt_namespaces(ns->ucounts); put_user_ns(ns->user_ns); kfree(ns); } /* * Assign a sequence number so we can detect when we attempt to bind * mount a reference to an older mount namespace into the current * mount namespace, preventing reference counting loops. A 64bit * number incrementing at 10Ghz will take 12,427 years to wrap which * is effectively never, so we can ignore the possibility. */ static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1); static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool anon) { struct mnt_namespace *new_ns; struct ucounts *ucounts; int ret; ucounts = inc_mnt_namespaces(user_ns); if (!ucounts) return ERR_PTR(-ENOSPC); new_ns = kzalloc(sizeof(struct mnt_namespace), GFP_KERNEL_ACCOUNT); if (!new_ns) { dec_mnt_namespaces(ucounts); return ERR_PTR(-ENOMEM); } if (!anon) { ret = ns_alloc_inum(&new_ns->ns); if (ret) { kfree(new_ns); dec_mnt_namespaces(ucounts); return ERR_PTR(ret); } } new_ns->ns.ops = &mntns_operations; if (!anon) new_ns->seq = atomic64_add_return(1, &mnt_ns_seq); refcount_set(&new_ns->ns.count, 1); INIT_LIST_HEAD(&new_ns->list); init_waitqueue_head(&new_ns->poll); spin_lock_init(&new_ns->ns_lock); new_ns->user_ns = get_user_ns(user_ns); new_ns->ucounts = ucounts; return new_ns; } __latent_entropy struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, struct user_namespace *user_ns, struct fs_struct *new_fs) { struct mnt_namespace *new_ns; struct vfsmount *rootmnt = NULL, *pwdmnt = NULL; struct mount *p, *q; struct mount *old; struct mount *new; int copy_flags; BUG_ON(!ns); if (likely(!(flags & CLONE_NEWNS))) { get_mnt_ns(ns); return ns; } old = ns->root; new_ns = alloc_mnt_ns(user_ns, false); if (IS_ERR(new_ns)) return new_ns; namespace_lock(); /* First pass: copy the tree topology */ copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE; if (user_ns != ns->user_ns) copy_flags |= CL_SHARED_TO_SLAVE; new = copy_tree(old, old->mnt.mnt_root, copy_flags); if (IS_ERR(new)) { namespace_unlock(); free_mnt_ns(new_ns); return ERR_CAST(new); } if (user_ns != ns->user_ns) { lock_mount_hash(); lock_mnt_tree(new); unlock_mount_hash(); } new_ns->root = new; list_add_tail(&new_ns->list, &new->mnt_list); /* * Second pass: switch the tsk->fs->* elements and mark new vfsmounts * as belonging to new namespace. We have already acquired a private * fs_struct, so tsk->fs->lock is not needed. */ p = old; q = new; while (p) { q->mnt_ns = new_ns; new_ns->mounts++; if (new_fs) { if (&p->mnt == new_fs->root.mnt) { new_fs->root.mnt = mntget(&q->mnt); rootmnt = &p->mnt; } if (&p->mnt == new_fs->pwd.mnt) { new_fs->pwd.mnt = mntget(&q->mnt); pwdmnt = &p->mnt; } } p = next_mnt(p, old); q = next_mnt(q, new); if (!q) break; while (p->mnt.mnt_root != q->mnt.mnt_root) p = next_mnt(p, old); } namespace_unlock(); if (rootmnt) mntput(rootmnt); if (pwdmnt) mntput(pwdmnt); return new_ns; } struct dentry *mount_subtree(struct vfsmount *m, const char *name) { struct mount *mnt = real_mount(m); struct mnt_namespace *ns; struct super_block *s; struct path path; int err; ns = alloc_mnt_ns(&init_user_ns, true); if (IS_ERR(ns)) { mntput(m); return ERR_CAST(ns); } mnt->mnt_ns = ns; ns->root = mnt; ns->mounts++; list_add(&mnt->mnt_list, &ns->list); err = vfs_path_lookup(m->mnt_root, m, name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path); put_mnt_ns(ns); if (err) return ERR_PTR(err); /* trade a vfsmount reference for active sb one */ s = path.mnt->mnt_sb; atomic_inc(&s->s_active); mntput(path.mnt); /* lock the sucker */ down_write(&s->s_umount); /* ... and return the root of (sub)tree on it */ return path.dentry; } EXPORT_SYMBOL(mount_subtree); SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, char __user *, type, unsigned long, flags, void __user *, data) { int ret; char *kernel_type; char *kernel_dev; void *options; kernel_type = copy_mount_string(type); ret = PTR_ERR(kernel_type); if (IS_ERR(kernel_type)) goto out_type; kernel_dev = copy_mount_string(dev_name); ret = PTR_ERR(kernel_dev); if (IS_ERR(kernel_dev)) goto out_dev; options = copy_mount_options(data); ret = PTR_ERR(options); if (IS_ERR(options)) goto out_data; ret = do_mount(kernel_dev, dir_name, kernel_type, flags, options); kfree(options); out_data: kfree(kernel_dev); out_dev: kfree(kernel_type); out_type: return ret; } #define FSMOUNT_VALID_FLAGS \ (MOUNT_ATTR_RDONLY | MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV | \ MOUNT_ATTR_NOEXEC | MOUNT_ATTR__ATIME | MOUNT_ATTR_NODIRATIME | \ MOUNT_ATTR_NOSYMFOLLOW) #define MOUNT_SETATTR_VALID_FLAGS (FSMOUNT_VALID_FLAGS | MOUNT_ATTR_IDMAP) #define MOUNT_SETATTR_PROPAGATION_FLAGS \ (MS_UNBINDABLE | MS_PRIVATE | MS_SLAVE | MS_SHARED) static unsigned int attr_flags_to_mnt_flags(u64 attr_flags) { unsigned int mnt_flags = 0; if (attr_flags & MOUNT_ATTR_RDONLY) mnt_flags |= MNT_READONLY; if (attr_flags & MOUNT_ATTR_NOSUID) mnt_flags |= MNT_NOSUID; if (attr_flags & MOUNT_ATTR_NODEV) mnt_flags |= MNT_NODEV; if (attr_flags & MOUNT_ATTR_NOEXEC) mnt_flags |= MNT_NOEXEC; if (attr_flags & MOUNT_ATTR_NODIRATIME) mnt_flags |= MNT_NODIRATIME; if (attr_flags & MOUNT_ATTR_NOSYMFOLLOW) mnt_flags |= MNT_NOSYMFOLLOW; return mnt_flags; } /* * Create a kernel mount representation for a new, prepared superblock * (specified by fs_fd) and attach to an open_tree-like file descriptor. */ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags, unsigned int, attr_flags) { struct mnt_namespace *ns; struct fs_context *fc; struct file *file; struct path newmount; struct mount *mnt; struct fd f; unsigned int mnt_flags = 0; long ret; if (!may_mount()) return -EPERM; if ((flags & ~(FSMOUNT_CLOEXEC)) != 0) return -EINVAL; if (attr_flags & ~FSMOUNT_VALID_FLAGS) return -EINVAL; mnt_flags = attr_flags_to_mnt_flags(attr_flags); switch (attr_flags & MOUNT_ATTR__ATIME) { case MOUNT_ATTR_STRICTATIME: break; case MOUNT_ATTR_NOATIME: mnt_flags |= MNT_NOATIME; break; case MOUNT_ATTR_RELATIME: mnt_flags |= MNT_RELATIME; break; default: return -EINVAL; } f = fdget(fs_fd); if (!f.file) return -EBADF; ret = -EINVAL; if (f.file->f_op != &fscontext_fops) goto err_fsfd; fc = f.file->private_data; ret = mutex_lock_interruptible(&fc->uapi_mutex); if (ret < 0) goto err_fsfd; /* There must be a valid superblock or we can't mount it */ ret = -EINVAL; if (!fc->root) goto err_unlock; ret = -EPERM; if (mount_too_revealing(fc->root->d_sb, &mnt_flags)) { pr_warn("VFS: Mount too revealing\n"); goto err_unlock; } ret = -EBUSY; if (fc->phase != FS_CONTEXT_AWAITING_MOUNT) goto err_unlock; if (fc->sb_flags & SB_MANDLOCK) warn_mandlock(); newmount.mnt = vfs_create_mount(fc); if (IS_ERR(newmount.mnt)) { ret = PTR_ERR(newmount.mnt); goto err_unlock; } newmount.dentry = dget(fc->root); newmount.mnt->mnt_flags = mnt_flags; /* We've done the mount bit - now move the file context into more or * less the same state as if we'd done an fspick(). We don't want to * do any memory allocation or anything like that at this point as we * don't want to have to handle any errors incurred. */ vfs_clean_context(fc); ns = alloc_mnt_ns(current->nsproxy->mnt_ns->user_ns, true); if (IS_ERR(ns)) { ret = PTR_ERR(ns); goto err_path; } mnt = real_mount(newmount.mnt); mnt->mnt_ns = ns; ns->root = mnt; ns->mounts = 1; list_add(&mnt->mnt_list, &ns->list); mntget(newmount.mnt); /* Attach to an apparent O_PATH fd with a note that we need to unmount * it, not just simply put it. */ file = dentry_open(&newmount, O_PATH, fc->cred); if (IS_ERR(file)) { dissolve_on_fput(newmount.mnt); ret = PTR_ERR(file); goto err_path; } file->f_mode |= FMODE_NEED_UNMOUNT; ret = get_unused_fd_flags((flags & FSMOUNT_CLOEXEC) ? O_CLOEXEC : 0); if (ret >= 0) fd_install(ret, file); else fput(file); err_path: path_put(&newmount); err_unlock: mutex_unlock(&fc->uapi_mutex); err_fsfd: fdput(f); return ret; } /* * Move a mount from one place to another. In combination with * fsopen()/fsmount() this is used to install a new mount and in combination * with open_tree(OPEN_TREE_CLONE [| AT_RECURSIVE]) it can be used to copy * a mount subtree. * * Note the flags value is a combination of MOVE_MOUNT_* flags. */ SYSCALL_DEFINE5(move_mount, int, from_dfd, const char __user *, from_pathname, int, to_dfd, const char __user *, to_pathname, unsigned int, flags) { struct path from_path, to_path; unsigned int lflags; int ret = 0; if (!may_mount()) return -EPERM; if (flags & ~MOVE_MOUNT__MASK) return -EINVAL; /* If someone gives a pathname, they aren't permitted to move * from an fd that requires unmount as we can't get at the flag * to clear it afterwards. */ lflags = 0; if (flags & MOVE_MOUNT_F_SYMLINKS) lflags |= LOOKUP_FOLLOW; if (flags & MOVE_MOUNT_F_AUTOMOUNTS) lflags |= LOOKUP_AUTOMOUNT; if (flags & MOVE_MOUNT_F_EMPTY_PATH) lflags |= LOOKUP_EMPTY; ret = user_path_at(from_dfd, from_pathname, lflags, &from_path); if (ret < 0) return ret; lflags = 0; if (flags & MOVE_MOUNT_T_SYMLINKS) lflags |= LOOKUP_FOLLOW; if (flags & MOVE_MOUNT_T_AUTOMOUNTS) lflags |= LOOKUP_AUTOMOUNT; if (flags & MOVE_MOUNT_T_EMPTY_PATH) lflags |= LOOKUP_EMPTY; ret = user_path_at(to_dfd, to_pathname, lflags, &to_path); if (ret < 0) goto out_from; ret = security_move_mount(&from_path, &to_path); if (ret < 0) goto out_to; if (flags & MOVE_MOUNT_SET_GROUP) ret = do_set_group(&from_path, &to_path); else ret = do_move_mount(&from_path, &to_path); out_to: path_put(&to_path); out_from: path_put(&from_path); return ret; } /* * Return true if path is reachable from root * * namespace_sem or mount_lock is held */ bool is_path_reachable(struct mount *mnt, struct dentry *dentry, const struct path *root) { while (&mnt->mnt != root->mnt && mnt_has_parent(mnt)) { dentry = mnt->mnt_mountpoint; mnt = mnt->mnt_parent; } return &mnt->mnt == root->mnt && is_subdir(dentry, root->dentry); } bool path_is_under(const struct path *path1, const struct path *path2) { bool res; read_seqlock_excl(&mount_lock); res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2); read_sequnlock_excl(&mount_lock); return res; } EXPORT_SYMBOL(path_is_under); /* * pivot_root Semantics: * Moves the root file system of the current process to the directory put_old, * makes new_root as the new root file system of the current process, and sets * root/cwd of all processes which had them on the current root to new_root. * * Restrictions: * The new_root and put_old must be directories, and must not be on the * same file system as the current process root. The put_old must be * underneath new_root, i.e. adding a non-zero number of /.. to the string * pointed to by put_old must yield the same directory as new_root. No other * file system may be mounted on put_old. After all, new_root is a mountpoint. * * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem. * See Documentation/filesystems/ramfs-rootfs-initramfs.rst for alternatives * in this situation. * * Notes: * - we don't move root/cwd if they are not at the root (reason: if something * cared enough to change them, it's probably wrong to force them elsewhere) * - it's okay to pick a root that isn't the root of a file system, e.g. * /nfs/my_root where /nfs is the mount point. It must be a mountpoint, * though, so you may need to say mount --bind /nfs/my_root /nfs/my_root * first. */ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, const char __user *, put_old) { struct path new, old, root; struct mount *new_mnt, *root_mnt, *old_mnt, *root_parent, *ex_parent; struct mountpoint *old_mp, *root_mp; int error; if (!may_mount()) return -EPERM; error = user_path_at(AT_FDCWD, new_root, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &new); if (error) goto out0; error = user_path_at(AT_FDCWD, put_old, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old); if (error) goto out1; error = security_sb_pivotroot(&old, &new); if (error) goto out2; get_fs_root(current->fs, &root); old_mp = lock_mount(&old); error = PTR_ERR(old_mp); if (IS_ERR(old_mp)) goto out3; error = -EINVAL; new_mnt = real_mount(new.mnt); root_mnt = real_mount(root.mnt); old_mnt = real_mount(old.mnt); ex_parent = new_mnt->mnt_parent; root_parent = root_mnt->mnt_parent; if (IS_MNT_SHARED(old_mnt) || IS_MNT_SHARED(ex_parent) || IS_MNT_SHARED(root_parent)) goto out4; if (!check_mnt(root_mnt) || !check_mnt(new_mnt)) goto out4; if (new_mnt->mnt.mnt_flags & MNT_LOCKED) goto out4; error = -ENOENT; if (d_unlinked(new.dentry)) goto out4; error = -EBUSY; if (new_mnt == root_mnt || old_mnt == root_mnt) goto out4; /* loop, on the same file system */ error = -EINVAL; if (root.mnt->mnt_root != root.dentry) goto out4; /* not a mountpoint */ if (!mnt_has_parent(root_mnt)) goto out4; /* not attached */ if (new.mnt->mnt_root != new.dentry) goto out4; /* not a mountpoint */ if (!mnt_has_parent(new_mnt)) goto out4; /* not attached */ /* make sure we can reach put_old from new_root */ if (!is_path_reachable(old_mnt, old.dentry, &new)) goto out4; /* make certain new is below the root */ if (!is_path_reachable(new_mnt, new.dentry, &root)) goto out4; lock_mount_hash(); umount_mnt(new_mnt); root_mp = unhash_mnt(root_mnt); /* we'll need its mountpoint */ if (root_mnt->mnt.mnt_flags & MNT_LOCKED) { new_mnt->mnt.mnt_flags |= MNT_LOCKED; root_mnt->mnt.mnt_flags &= ~MNT_LOCKED; } /* mount old root on put_old */ attach_mnt(root_mnt, old_mnt, old_mp); /* mount new_root on / */ attach_mnt(new_mnt, root_parent, root_mp); mnt_add_count(root_parent, -1); touch_mnt_namespace(current->nsproxy->mnt_ns); /* A moved mount should not expire automatically */ list_del_init(&new_mnt->mnt_expire); put_mountpoint(root_mp); unlock_mount_hash(); chroot_fs_refs(&root, &new); error = 0; out4: unlock_mount(old_mp); if (!error) mntput_no_expire(ex_parent); out3: path_put(&root); out2: path_put(&old); out1: path_put(&new); out0: return error; } static unsigned int recalc_flags(struct mount_kattr *kattr, struct mount *mnt) { unsigned int flags = mnt->mnt.mnt_flags; /* flags to clear */ flags &= ~kattr->attr_clr; /* flags to raise */ flags |= kattr->attr_set; return flags; } static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) { struct vfsmount *m = &mnt->mnt; struct user_namespace *fs_userns = m->mnt_sb->s_user_ns; if (!kattr->mnt_userns) return 0; /* * Creating an idmapped mount with the filesystem wide idmapping * doesn't make sense so block that. We don't allow mushy semantics. */ if (kattr->mnt_userns == fs_userns) return -EINVAL; /* * Once a mount has been idmapped we don't allow it to change its * mapping. It makes things simpler and callers can just create * another bind-mount they can idmap if they want to. */ if (is_idmapped_mnt(m)) return -EPERM; /* The underlying filesystem doesn't support idmapped mounts yet. */ if (!(m->mnt_sb->s_type->fs_flags & FS_ALLOW_IDMAP)) return -EINVAL; /* We're not controlling the superblock. */ if (!ns_capable(fs_userns, CAP_SYS_ADMIN)) return -EPERM; /* Mount has already been visible in the filesystem hierarchy. */ if (!is_anon_ns(mnt->mnt_ns)) return -EINVAL; return 0; } /** * mnt_allow_writers() - check whether the attribute change allows writers * @kattr: the new mount attributes * @mnt: the mount to which @kattr will be applied * * Check whether thew new mount attributes in @kattr allow concurrent writers. * * Return: true if writers need to be held, false if not */ static inline bool mnt_allow_writers(const struct mount_kattr *kattr, const struct mount *mnt) { return (!(kattr->attr_set & MNT_READONLY) || (mnt->mnt.mnt_flags & MNT_READONLY)) && !kattr->mnt_userns; } static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt) { struct mount *m; int err; for (m = mnt; m; m = next_mnt(m, mnt)) { if (!can_change_locked_flags(m, recalc_flags(kattr, m))) { err = -EPERM; break; } err = can_idmap_mount(kattr, m); if (err) break; if (!mnt_allow_writers(kattr, m)) { err = mnt_hold_writers(m); if (err) break; } if (!kattr->recurse) return 0; } if (err) { struct mount *p; /* * If we had to call mnt_hold_writers() MNT_WRITE_HOLD will * be set in @mnt_flags. The loop unsets MNT_WRITE_HOLD for all * mounts and needs to take care to include the first mount. */ for (p = mnt; p; p = next_mnt(p, mnt)) { /* If we had to hold writers unblock them. */ if (p->mnt.mnt_flags & MNT_WRITE_HOLD) mnt_unhold_writers(p); /* * We're done once the first mount we changed got * MNT_WRITE_HOLD unset. */ if (p == m) break; } } return err; } static void do_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) { struct user_namespace *mnt_userns, *old_mnt_userns; if (!kattr->mnt_userns) return; /* * We're the only ones able to change the mount's idmapping. So * mnt->mnt.mnt_userns is stable and we can retrieve it directly. */ old_mnt_userns = mnt->mnt.mnt_userns; mnt_userns = get_user_ns(kattr->mnt_userns); /* Pairs with smp_load_acquire() in mnt_user_ns(). */ smp_store_release(&mnt->mnt.mnt_userns, mnt_userns); /* * If this is an idmapped filesystem drop the reference we've taken * in vfs_create_mount() before. */ if (!initial_idmapping(old_mnt_userns)) put_user_ns(old_mnt_userns); } static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt) { struct mount *m; for (m = mnt; m; m = next_mnt(m, mnt)) { unsigned int flags; do_idmap_mount(kattr, m); flags = recalc_flags(kattr, m); WRITE_ONCE(m->mnt.mnt_flags, flags); /* If we had to hold writers unblock them. */ if (m->mnt.mnt_flags & MNT_WRITE_HOLD) mnt_unhold_writers(m); if (kattr->propagation) change_mnt_propagation(m, kattr->propagation); if (!kattr->recurse) break; } touch_mnt_namespace(mnt->mnt_ns); } static int do_mount_setattr(struct path *path, struct mount_kattr *kattr) { struct mount *mnt = real_mount(path->mnt); int err = 0; if (path->dentry != mnt->mnt.mnt_root) return -EINVAL; if (kattr->propagation) { /* * Only take namespace_lock() if we're actually changing * propagation. */ namespace_lock(); if (kattr->propagation == MS_SHARED) { err = invent_group_ids(mnt, kattr->recurse); if (err) { namespace_unlock(); return err; } } } err = -EINVAL; lock_mount_hash(); /* Ensure that this isn't anything purely vfs internal. */ if (!is_mounted(&mnt->mnt)) goto out; /* * If this is an attached mount make sure it's located in the callers * mount namespace. If it's not don't let the caller interact with it. * * If this mount doesn't have a parent it's most often simply a * detached mount with an anonymous mount namespace. IOW, something * that's simply not attached yet. But there are apparently also users * that do change mount properties on the rootfs itself. That obviously * neither has a parent nor is it a detached mount so we cannot * unconditionally check for detached mounts. */ if ((mnt_has_parent(mnt) || !is_anon_ns(mnt->mnt_ns)) && !check_mnt(mnt)) goto out; /* * First, we get the mount tree in a shape where we can change mount * properties without failure. If we succeeded to do so we commit all * changes and if we failed we clean up. */ err = mount_setattr_prepare(kattr, mnt); if (!err) mount_setattr_commit(kattr, mnt); out: unlock_mount_hash(); if (kattr->propagation) { if (err) cleanup_group_ids(mnt, NULL); namespace_unlock(); } return err; } static int build_mount_idmapped(const struct mount_attr *attr, size_t usize, struct mount_kattr *kattr, unsigned int flags) { int err = 0; struct ns_common *ns; struct user_namespace *mnt_userns; struct file *file; if (!((attr->attr_set | attr->attr_clr) & MOUNT_ATTR_IDMAP)) return 0; /* * We currently do not support clearing an idmapped mount. If this ever * is a use-case we can revisit this but for now let's keep it simple * and not allow it. */ if (attr->attr_clr & MOUNT_ATTR_IDMAP) return -EINVAL; if (attr->userns_fd > INT_MAX) return -EINVAL; file = fget(attr->userns_fd); if (!file) return -EBADF; if (!proc_ns_file(file)) { err = -EINVAL; goto out_fput; } ns = get_proc_ns(file_inode(file)); if (ns->ops->type != CLONE_NEWUSER) { err = -EINVAL; goto out_fput; } /* * The initial idmapping cannot be used to create an idmapped * mount. We use the initial idmapping as an indicator of a mount * that is not idmapped. It can simply be passed into helpers that * are aware of idmapped mounts as a convenient shortcut. A user * can just create a dedicated identity mapping to achieve the same * result. */ mnt_userns = container_of(ns, struct user_namespace, ns); if (initial_idmapping(mnt_userns)) { err = -EPERM; goto out_fput; } /* We're not controlling the target namespace. */ if (!ns_capable(mnt_userns, CAP_SYS_ADMIN)) { err = -EPERM; goto out_fput; } kattr->mnt_userns = get_user_ns(mnt_userns); out_fput: fput(file); return err; } static int build_mount_kattr(const struct mount_attr *attr, size_t usize, struct mount_kattr *kattr, unsigned int flags) { unsigned int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW; if (flags & AT_NO_AUTOMOUNT) lookup_flags &= ~LOOKUP_AUTOMOUNT; if (flags & AT_SYMLINK_NOFOLLOW) lookup_flags &= ~LOOKUP_FOLLOW; if (flags & AT_EMPTY_PATH) lookup_flags |= LOOKUP_EMPTY; *kattr = (struct mount_kattr) { .lookup_flags = lookup_flags, .recurse = !!(flags & AT_RECURSIVE), }; if (attr->propagation & ~MOUNT_SETATTR_PROPAGATION_FLAGS) return -EINVAL; if (hweight32(attr->propagation & MOUNT_SETATTR_PROPAGATION_FLAGS) > 1) return -EINVAL; kattr->propagation = attr->propagation; if ((attr->attr_set | attr->attr_clr) & ~MOUNT_SETATTR_VALID_FLAGS) return -EINVAL; kattr->attr_set = attr_flags_to_mnt_flags(attr->attr_set); kattr->attr_clr = attr_flags_to_mnt_flags(attr->attr_clr); /* * Since the MOUNT_ATTR_<atime> values are an enum, not a bitmap, * users wanting to transition to a different atime setting cannot * simply specify the atime setting in @attr_set, but must also * specify MOUNT_ATTR__ATIME in the @attr_clr field. * So ensure that MOUNT_ATTR__ATIME can't be partially set in * @attr_clr and that @attr_set can't have any atime bits set if * MOUNT_ATTR__ATIME isn't set in @attr_clr. */ if (attr->attr_clr & MOUNT_ATTR__ATIME) { if ((attr->attr_clr & MOUNT_ATTR__ATIME) != MOUNT_ATTR__ATIME) return -EINVAL; /* * Clear all previous time settings as they are mutually * exclusive. */ kattr->attr_clr |= MNT_RELATIME | MNT_NOATIME; switch (attr->attr_set & MOUNT_ATTR__ATIME) { case MOUNT_ATTR_RELATIME: kattr->attr_set |= MNT_RELATIME; break; case MOUNT_ATTR_NOATIME: kattr->attr_set |= MNT_NOATIME; break; case MOUNT_ATTR_STRICTATIME: break; default: return -EINVAL; } } else { if (attr->attr_set & MOUNT_ATTR__ATIME) return -EINVAL; } return build_mount_idmapped(attr, usize, kattr, flags); } static void finish_mount_kattr(struct mount_kattr *kattr) { put_user_ns(kattr->mnt_userns); kattr->mnt_userns = NULL; } SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path, unsigned int, flags, struct mount_attr __user *, uattr, size_t, usize) { int err; struct path target; struct mount_attr attr; struct mount_kattr kattr; BUILD_BUG_ON(sizeof(struct mount_attr) != MOUNT_ATTR_SIZE_VER0); if (flags & ~(AT_EMPTY_PATH | AT_RECURSIVE | AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT)) return -EINVAL; if (unlikely(usize > PAGE_SIZE)) return -E2BIG; if (unlikely(usize < MOUNT_ATTR_SIZE_VER0)) return -EINVAL; if (!may_mount()) return -EPERM; err = copy_struct_from_user(&attr, sizeof(attr), uattr, usize); if (err) return err; /* Don't bother walking through the mounts if this is a nop. */ if (attr.attr_set == 0 && attr.attr_clr == 0 && attr.propagation == 0) return 0; err = build_mount_kattr(&attr, usize, &kattr, flags); if (err) return err; err = user_path_at(dfd, path, kattr.lookup_flags, &target); if (!err) { err = do_mount_setattr(&target, &kattr); path_put(&target); } finish_mount_kattr(&kattr); return err; } static void __init init_mount_tree(void) { struct vfsmount *mnt; struct mount *m; struct mnt_namespace *ns; struct path root; mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", NULL); if (IS_ERR(mnt)) panic("Can't create rootfs"); ns = alloc_mnt_ns(&init_user_ns, false); if (IS_ERR(ns)) panic("Can't allocate initial namespace"); m = real_mount(mnt); m->mnt_ns = ns; ns->root = m; ns->mounts = 1; list_add(&m->mnt_list, &ns->list); init_task.nsproxy->mnt_ns = ns; get_mnt_ns(ns); root.mnt = mnt; root.dentry = mnt->mnt_root; mnt->mnt_flags |= MNT_LOCKED; set_fs_pwd(current->fs, &root); set_fs_root(current->fs, &root); } void __init mnt_init(void) { int err; mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); mount_hashtable = alloc_large_system_hash("Mount-cache", sizeof(struct hlist_head), mhash_entries, 19, HASH_ZERO, &m_hash_shift, &m_hash_mask, 0, 0); mountpoint_hashtable = alloc_large_system_hash("Mountpoint-cache", sizeof(struct hlist_head), mphash_entries, 19, HASH_ZERO, &mp_hash_shift, &mp_hash_mask, 0, 0); if (!mount_hashtable || !mountpoint_hashtable) panic("Failed to allocate mount hash table\n"); kernfs_init(); err = sysfs_init(); if (err) printk(KERN_WARNING "%s: sysfs_init error: %d\n", __func__, err); fs_kobj = kobject_create_and_add("fs", NULL); if (!fs_kobj) printk(KERN_WARNING "%s: kobj create error\n", __func__); shmem_init(); init_rootfs(); init_mount_tree(); } void put_mnt_ns(struct mnt_namespace *ns) { if (!refcount_dec_and_test(&ns->ns.count)) return; drop_collected_mounts(&ns->root->mnt); free_mnt_ns(ns); } struct vfsmount *kern_mount(struct file_system_type *type) { struct vfsmount *mnt; mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, NULL); if (!IS_ERR(mnt)) { /* * it is a longterm mount, don't release mnt until * we unmount before file sys is unregistered */ real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL; } return mnt; } EXPORT_SYMBOL_GPL(kern_mount); void kern_unmount(struct vfsmount *mnt) { /* release long term mount so mount point can be released */ if (!IS_ERR_OR_NULL(mnt)) { real_mount(mnt)->mnt_ns = NULL; synchronize_rcu(); /* yecchhh... */ mntput(mnt); } } EXPORT_SYMBOL(kern_unmount); void kern_unmount_array(struct vfsmount *mnt[], unsigned int num) { unsigned int i; for (i = 0; i < num; i++) if (mnt[i]) real_mount(mnt[i])->mnt_ns = NULL; synchronize_rcu_expedited(); for (i = 0; i < num; i++) mntput(mnt[i]); } EXPORT_SYMBOL(kern_unmount_array); bool our_mnt(struct vfsmount *mnt) { return check_mnt(real_mount(mnt)); } bool current_chrooted(void) { /* Does the current process have a non-standard root */ struct path ns_root; struct path fs_root; bool chrooted; /* Find the namespace root */ ns_root.mnt = ¤t->nsproxy->mnt_ns->root->mnt; ns_root.dentry = ns_root.mnt->mnt_root; path_get(&ns_root); while (d_mountpoint(ns_root.dentry) && follow_down_one(&ns_root)) ; get_fs_root(current->fs, &fs_root); chrooted = !path_equal(&fs_root, &ns_root); path_put(&fs_root); path_put(&ns_root); return chrooted; } static bool mnt_already_visible(struct mnt_namespace *ns, const struct super_block *sb, int *new_mnt_flags) { int new_flags = *new_mnt_flags; struct mount *mnt; bool visible = false; down_read(&namespace_sem); lock_ns_list(ns); list_for_each_entry(mnt, &ns->list, mnt_list) { struct mount *child; int mnt_flags; if (mnt_is_cursor(mnt)) continue; if (mnt->mnt.mnt_sb->s_type != sb->s_type) continue; /* This mount is not fully visible if it's root directory * is not the root directory of the filesystem. */ if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root) continue; /* A local view of the mount flags */ mnt_flags = mnt->mnt.mnt_flags; /* Don't miss readonly hidden in the superblock flags */ if (sb_rdonly(mnt->mnt.mnt_sb)) mnt_flags |= MNT_LOCK_READONLY; /* Verify the mount flags are equal to or more permissive * than the proposed new mount. */ if ((mnt_flags & MNT_LOCK_READONLY) && !(new_flags & MNT_READONLY)) continue; if ((mnt_flags & MNT_LOCK_ATIME) && ((mnt_flags & MNT_ATIME_MASK) != (new_flags & MNT_ATIME_MASK))) continue; /* This mount is not fully visible if there are any * locked child mounts that cover anything except for * empty directories. */ list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { struct inode *inode = child->mnt_mountpoint->d_inode; /* Only worry about locked mounts */ if (!(child->mnt.mnt_flags & MNT_LOCKED)) continue; /* Is the directory permanetly empty? */ if (!is_empty_dir_inode(inode)) goto next; } /* Preserve the locked attributes */ *new_mnt_flags |= mnt_flags & (MNT_LOCK_READONLY | \ MNT_LOCK_ATIME); visible = true; goto found; next: ; } found: unlock_ns_list(ns); up_read(&namespace_sem); return visible; } static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags) { const unsigned long required_iflags = SB_I_NOEXEC | SB_I_NODEV; struct mnt_namespace *ns = current->nsproxy->mnt_ns; unsigned long s_iflags; if (ns->user_ns == &init_user_ns) return false; /* Can this filesystem be too revealing? */ s_iflags = sb->s_iflags; if (!(s_iflags & SB_I_USERNS_VISIBLE)) return false; if ((s_iflags & required_iflags) != required_iflags) { WARN_ONCE(1, "Expected s_iflags to contain 0x%lx\n", required_iflags); return true; } return !mnt_already_visible(ns, sb, new_mnt_flags); } bool mnt_may_suid(struct vfsmount *mnt) { /* * Foreign mounts (accessed via fchdir or through /proc * symlinks) are always treated as if they are nosuid. This * prevents namespaces from trusting potentially unsafe * suid/sgid bits, file caps, or security labels that originate * in other namespaces. */ return !(mnt->mnt_flags & MNT_NOSUID) && check_mnt(real_mount(mnt)) && current_in_userns(mnt->mnt_sb->s_user_ns); } static struct ns_common *mntns_get(struct task_struct *task) { struct ns_common *ns = NULL; struct nsproxy *nsproxy; task_lock(task); nsproxy = task->nsproxy; if (nsproxy) { ns = &nsproxy->mnt_ns->ns; get_mnt_ns(to_mnt_ns(ns)); } task_unlock(task); return ns; } static void mntns_put(struct ns_common *ns) { put_mnt_ns(to_mnt_ns(ns)); } static int mntns_install(struct nsset *nsset, struct ns_common *ns) { struct nsproxy *nsproxy = nsset->nsproxy; struct fs_struct *fs = nsset->fs; struct mnt_namespace *mnt_ns = to_mnt_ns(ns), *old_mnt_ns; struct user_namespace *user_ns = nsset->cred->user_ns; struct path root; int err; if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) || !ns_capable(user_ns, CAP_SYS_CHROOT) || !ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; if (is_anon_ns(mnt_ns)) return -EINVAL; if (fs->users != 1) return -EINVAL; get_mnt_ns(mnt_ns); old_mnt_ns = nsproxy->mnt_ns; nsproxy->mnt_ns = mnt_ns; /* Find the root */ err = vfs_path_lookup(mnt_ns->root->mnt.mnt_root, &mnt_ns->root->mnt, "/", LOOKUP_DOWN, &root); if (err) { /* revert to old namespace */ nsproxy->mnt_ns = old_mnt_ns; put_mnt_ns(mnt_ns); return err; } put_mnt_ns(old_mnt_ns); /* Update the pwd and root */ set_fs_pwd(fs, &root); set_fs_root(fs, &root); path_put(&root); return 0; } static struct user_namespace *mntns_owner(struct ns_common *ns) { return to_mnt_ns(ns)->user_ns; } const struct proc_ns_operations mntns_operations = { .name = "mnt", .type = CLONE_NEWNS, .get = mntns_get, .put = mntns_put, .install = mntns_install, .owner = mntns_owner, }; #ifdef CONFIG_SYSCTL static struct ctl_table fs_namespace_sysctls[] = { { .procname = "mount-max", .data = &sysctl_mount_max, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE, }, { } }; static int __init init_fs_namespace_sysctls(void) { register_sysctl_init("fs", fs_namespace_sysctls); return 0; } fs_initcall(init_fs_namespace_sysctls); #endif /* CONFIG_SYSCTL */ |
403 372 372 372 3 369 372 2 7 139 309 310 310 310 166 166 166 166 293 292 293 276 64 5 293 293 4 293 4 176 175 176 14 175 176 41 165 62 991 991 991 990 56 56 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/list.h> #include <linux/list_bl.h> #include <linux/module.h> #include <linux/sched.h> #include <linux/workqueue.h> #include <linux/mbcache.h> /* * Mbcache is a simple key-value store. Keys need not be unique, however * key-value pairs are expected to be unique (we use this fact in * mb_cache_entry_delete_or_get()). * * Ext2 and ext4 use this cache for deduplication of extended attribute blocks. * Ext4 also uses it for deduplication of xattr values stored in inodes. * They use hash of data as a key and provide a value that may represent a * block or inode number. That's why keys need not be unique (hash of different * data may be the same). However user provided value always uniquely * identifies a cache entry. * * We provide functions for creation and removal of entries, search by key, * and a special "delete entry with given key-value pair" operation. Fixed * size hash table is used for fast key lookups. */ struct mb_cache { /* Hash table of entries */ struct hlist_bl_head *c_hash; /* log2 of hash table size */ int c_bucket_bits; /* Maximum entries in cache to avoid degrading hash too much */ unsigned long c_max_entries; /* Protects c_list, c_entry_count */ spinlock_t c_list_lock; struct list_head c_list; /* Number of entries in cache */ unsigned long c_entry_count; struct shrinker c_shrink; /* Work for shrinking when the cache has too many entries */ struct work_struct c_shrink_work; }; static struct kmem_cache *mb_entry_cache; static unsigned long mb_cache_shrink(struct mb_cache *cache, unsigned long nr_to_scan); static inline struct hlist_bl_head *mb_cache_entry_head(struct mb_cache *cache, u32 key) { return &cache->c_hash[hash_32(key, cache->c_bucket_bits)]; } /* * Number of entries to reclaim synchronously when there are too many entries * in cache */ #define SYNC_SHRINK_BATCH 64 /* * mb_cache_entry_create - create entry in cache * @cache - cache where the entry should be created * @mask - gfp mask with which the entry should be allocated * @key - key of the entry * @value - value of the entry * @reusable - is the entry reusable by others? * * Creates entry in @cache with key @key and value @value. The function returns * -EBUSY if entry with the same key and value already exists in cache. * Otherwise 0 is returned. */ int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key, u64 value, bool reusable) { struct mb_cache_entry *entry, *dup; struct hlist_bl_node *dup_node; struct hlist_bl_head *head; /* Schedule background reclaim if there are too many entries */ if (cache->c_entry_count >= cache->c_max_entries) schedule_work(&cache->c_shrink_work); /* Do some sync reclaim if background reclaim cannot keep up */ if (cache->c_entry_count >= 2*cache->c_max_entries) mb_cache_shrink(cache, SYNC_SHRINK_BATCH); entry = kmem_cache_alloc(mb_entry_cache, mask); if (!entry) return -ENOMEM; INIT_LIST_HEAD(&entry->e_list); /* * We create entry with two references. One reference is kept by the * hash table, the other reference is used to protect us from * mb_cache_entry_delete_or_get() until the entry is fully setup. This * avoids nesting of cache->c_list_lock into hash table bit locks which * is problematic for RT. */ atomic_set(&entry->e_refcnt, 2); entry->e_key = key; entry->e_value = value; entry->e_flags = 0; if (reusable) set_bit(MBE_REUSABLE_B, &entry->e_flags); head = mb_cache_entry_head(cache, key); hlist_bl_lock(head); hlist_bl_for_each_entry(dup, dup_node, head, e_hash_list) { if (dup->e_key == key && dup->e_value == value) { hlist_bl_unlock(head); kmem_cache_free(mb_entry_cache, entry); return -EBUSY; } } hlist_bl_add_head(&entry->e_hash_list, head); hlist_bl_unlock(head); spin_lock(&cache->c_list_lock); list_add_tail(&entry->e_list, &cache->c_list); cache->c_entry_count++; spin_unlock(&cache->c_list_lock); mb_cache_entry_put(cache, entry); return 0; } EXPORT_SYMBOL(mb_cache_entry_create); void __mb_cache_entry_free(struct mb_cache *cache, struct mb_cache_entry *entry) { struct hlist_bl_head *head; head = mb_cache_entry_head(cache, entry->e_key); hlist_bl_lock(head); hlist_bl_del(&entry->e_hash_list); hlist_bl_unlock(head); kmem_cache_free(mb_entry_cache, entry); } EXPORT_SYMBOL(__mb_cache_entry_free); /* * mb_cache_entry_wait_unused - wait to be the last user of the entry * * @entry - entry to work on * * Wait to be the last user of the entry. */ void mb_cache_entry_wait_unused(struct mb_cache_entry *entry) { wait_var_event(&entry->e_refcnt, atomic_read(&entry->e_refcnt) <= 2); } EXPORT_SYMBOL(mb_cache_entry_wait_unused); static struct mb_cache_entry *__entry_find(struct mb_cache *cache, struct mb_cache_entry *entry, u32 key) { struct mb_cache_entry *old_entry = entry; struct hlist_bl_node *node; struct hlist_bl_head *head; head = mb_cache_entry_head(cache, key); hlist_bl_lock(head); if (entry && !hlist_bl_unhashed(&entry->e_hash_list)) node = entry->e_hash_list.next; else node = hlist_bl_first(head); while (node) { entry = hlist_bl_entry(node, struct mb_cache_entry, e_hash_list); if (entry->e_key == key && test_bit(MBE_REUSABLE_B, &entry->e_flags) && atomic_inc_not_zero(&entry->e_refcnt)) goto out; node = node->next; } entry = NULL; out: hlist_bl_unlock(head); if (old_entry) mb_cache_entry_put(cache, old_entry); return entry; } /* * mb_cache_entry_find_first - find the first reusable entry with the given key * @cache: cache where we should search * @key: key to look for * * Search in @cache for a reusable entry with key @key. Grabs reference to the * first reusable entry found and returns the entry. */ struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache, u32 key) { return __entry_find(cache, NULL, key); } EXPORT_SYMBOL(mb_cache_entry_find_first); /* * mb_cache_entry_find_next - find next reusable entry with the same key * @cache: cache where we should search * @entry: entry to start search from * * Finds next reusable entry in the hash chain which has the same key as @entry. * If @entry is unhashed (which can happen when deletion of entry races with the * search), finds the first reusable entry in the hash chain. The function drops * reference to @entry and returns with a reference to the found entry. */ struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache, struct mb_cache_entry *entry) { return __entry_find(cache, entry, entry->e_key); } EXPORT_SYMBOL(mb_cache_entry_find_next); /* * mb_cache_entry_get - get a cache entry by value (and key) * @cache - cache we work with * @key - key * @value - value */ struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key, u64 value) { struct hlist_bl_node *node; struct hlist_bl_head *head; struct mb_cache_entry *entry; head = mb_cache_entry_head(cache, key); hlist_bl_lock(head); hlist_bl_for_each_entry(entry, node, head, e_hash_list) { if (entry->e_key == key && entry->e_value == value && atomic_inc_not_zero(&entry->e_refcnt)) goto out; } entry = NULL; out: hlist_bl_unlock(head); return entry; } EXPORT_SYMBOL(mb_cache_entry_get); /* mb_cache_entry_delete_or_get - remove a cache entry if it has no users * @cache - cache we work with * @key - key * @value - value * * Remove entry from cache @cache with key @key and value @value. The removal * happens only if the entry is unused. The function returns NULL in case the * entry was successfully removed or there's no entry in cache. Otherwise the * function grabs reference of the entry that we failed to delete because it * still has users and return it. */ struct mb_cache_entry *mb_cache_entry_delete_or_get(struct mb_cache *cache, u32 key, u64 value) { struct mb_cache_entry *entry; entry = mb_cache_entry_get(cache, key, value); if (!entry) return NULL; /* * Drop the ref we got from mb_cache_entry_get() and the initial hash * ref if we are the last user */ if (atomic_cmpxchg(&entry->e_refcnt, 2, 0) != 2) return entry; spin_lock(&cache->c_list_lock); if (!list_empty(&entry->e_list)) list_del_init(&entry->e_list); cache->c_entry_count--; spin_unlock(&cache->c_list_lock); __mb_cache_entry_free(cache, entry); return NULL; } EXPORT_SYMBOL(mb_cache_entry_delete_or_get); /* mb_cache_entry_touch - cache entry got used * @cache - cache the entry belongs to * @entry - entry that got used * * Marks entry as used to give hit higher chances of surviving in cache. */ void mb_cache_entry_touch(struct mb_cache *cache, struct mb_cache_entry *entry) { set_bit(MBE_REFERENCED_B, &entry->e_flags); } EXPORT_SYMBOL(mb_cache_entry_touch); static unsigned long mb_cache_count(struct shrinker *shrink, struct shrink_control *sc) { struct mb_cache *cache = container_of(shrink, struct mb_cache, c_shrink); return cache->c_entry_count; } /* Shrink number of entries in cache */ static unsigned long mb_cache_shrink(struct mb_cache *cache, unsigned long nr_to_scan) { struct mb_cache_entry *entry; unsigned long shrunk = 0; spin_lock(&cache->c_list_lock); while (nr_to_scan-- && !list_empty(&cache->c_list)) { entry = list_first_entry(&cache->c_list, struct mb_cache_entry, e_list); /* Drop initial hash reference if there is no user */ if (test_bit(MBE_REFERENCED_B, &entry->e_flags) || atomic_cmpxchg(&entry->e_refcnt, 1, 0) != 1) { clear_bit(MBE_REFERENCED_B, &entry->e_flags); list_move_tail(&entry->e_list, &cache->c_list); continue; } list_del_init(&entry->e_list); cache->c_entry_count--; spin_unlock(&cache->c_list_lock); __mb_cache_entry_free(cache, entry); shrunk++; cond_resched(); spin_lock(&cache->c_list_lock); } spin_unlock(&cache->c_list_lock); return shrunk; } static unsigned long mb_cache_scan(struct shrinker *shrink, struct shrink_control *sc) { struct mb_cache *cache = container_of(shrink, struct mb_cache, c_shrink); return mb_cache_shrink(cache, sc->nr_to_scan); } /* We shrink 1/X of the cache when we have too many entries in it */ #define SHRINK_DIVISOR 16 static void mb_cache_shrink_worker(struct work_struct *work) { struct mb_cache *cache = container_of(work, struct mb_cache, c_shrink_work); mb_cache_shrink(cache, cache->c_max_entries / SHRINK_DIVISOR); } /* * mb_cache_create - create cache * @bucket_bits: log2 of the hash table size * * Create cache for keys with 2^bucket_bits hash entries. */ struct mb_cache *mb_cache_create(int bucket_bits) { struct mb_cache *cache; unsigned long bucket_count = 1UL << bucket_bits; unsigned long i; cache = kzalloc(sizeof(struct mb_cache), GFP_KERNEL); if (!cache) goto err_out; cache->c_bucket_bits = bucket_bits; cache->c_max_entries = bucket_count << 4; INIT_LIST_HEAD(&cache->c_list); spin_lock_init(&cache->c_list_lock); cache->c_hash = kmalloc_array(bucket_count, sizeof(struct hlist_bl_head), GFP_KERNEL); if (!cache->c_hash) { kfree(cache); goto err_out; } for (i = 0; i < bucket_count; i++) INIT_HLIST_BL_HEAD(&cache->c_hash[i]); cache->c_shrink.count_objects = mb_cache_count; cache->c_shrink.scan_objects = mb_cache_scan; cache->c_shrink.seeks = DEFAULT_SEEKS; if (register_shrinker(&cache->c_shrink, "mbcache-shrinker")) { kfree(cache->c_hash); kfree(cache); goto err_out; } INIT_WORK(&cache->c_shrink_work, mb_cache_shrink_worker); return cache; err_out: return NULL; } EXPORT_SYMBOL(mb_cache_create); /* * mb_cache_destroy - destroy cache * @cache: the cache to destroy * * Free all entries in cache and cache itself. Caller must make sure nobody * (except shrinker) can reach @cache when calling this. */ void mb_cache_destroy(struct mb_cache *cache) { struct mb_cache_entry *entry, *next; unregister_shrinker(&cache->c_shrink); /* * We don't bother with any locking. Cache must not be used at this * point. */ list_for_each_entry_safe(entry, next, &cache->c_list, e_list) { list_del(&entry->e_list); WARN_ON(atomic_read(&entry->e_refcnt) != 1); mb_cache_entry_put(cache, entry); } kfree(cache->c_hash); kfree(cache); } EXPORT_SYMBOL(mb_cache_destroy); static int __init mbcache_init(void) { mb_entry_cache = kmem_cache_create("mbcache", sizeof(struct mb_cache_entry), 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); if (!mb_entry_cache) return -ENOMEM; return 0; } static void __exit mbcache_exit(void) { kmem_cache_destroy(mb_entry_cache); } module_init(mbcache_init) module_exit(mbcache_exit) MODULE_AUTHOR("Jan Kara <jack@suse.cz>"); MODULE_DESCRIPTION("Meta block cache (for extended attributes)"); MODULE_LICENSE("GPL"); |
2 25 2 24 4 76 60 23 21 2 2 2 424 425 7 2 4 4 140 142 21 21 425 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 | // SPDX-License-Identifier: GPL-2.0-only /* Event cache for netfilter. */ /* * (C) 2005 Harald Welte <laforge@gnumonks.org> * (C) 2005 Patrick McHardy <kaber@trash.net> * (C) 2005-2006 Netfilter Core Team <coreteam@netfilter.org> * (C) 2005 USAGI/WIDE Project <http://www.linux-ipv6.org> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/types.h> #include <linux/netfilter.h> #include <linux/skbuff.h> #include <linux/vmalloc.h> #include <linux/stddef.h> #include <linux/err.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/slab.h> #include <linux/export.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_extend.h> static DEFINE_MUTEX(nf_ct_ecache_mutex); #define DYING_NULLS_VAL ((1 << 30) + 1) #define ECACHE_MAX_JIFFIES msecs_to_jiffies(10) #define ECACHE_RETRY_JIFFIES msecs_to_jiffies(10) enum retry_state { STATE_CONGESTED, STATE_RESTART, STATE_DONE, }; struct nf_conntrack_net_ecache *nf_conn_pernet_ecache(const struct net *net) { struct nf_conntrack_net *cnet = nf_ct_pernet(net); return &cnet->ecache; } #if IS_MODULE(CONFIG_NF_CT_NETLINK) EXPORT_SYMBOL_GPL(nf_conn_pernet_ecache); #endif static enum retry_state ecache_work_evict_list(struct nf_conntrack_net *cnet) { unsigned long stop = jiffies + ECACHE_MAX_JIFFIES; struct hlist_nulls_head evicted_list; enum retry_state ret = STATE_DONE; struct nf_conntrack_tuple_hash *h; struct hlist_nulls_node *n; unsigned int sent; INIT_HLIST_NULLS_HEAD(&evicted_list, DYING_NULLS_VAL); next: sent = 0; spin_lock_bh(&cnet->ecache.dying_lock); hlist_nulls_for_each_entry_safe(h, n, &cnet->ecache.dying_list, hnnode) { struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); /* The worker owns all entries, ct remains valid until nf_ct_put * in the loop below. */ if (nf_conntrack_event(IPCT_DESTROY, ct)) { ret = STATE_CONGESTED; break; } hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode, &evicted_list); if (time_after(stop, jiffies)) { ret = STATE_RESTART; break; } if (sent++ > 16) { spin_unlock_bh(&cnet->ecache.dying_lock); cond_resched(); goto next; } } spin_unlock_bh(&cnet->ecache.dying_lock); hlist_nulls_for_each_entry_safe(h, n, &evicted_list, hnnode) { struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode); nf_ct_put(ct); cond_resched(); } return ret; } static void ecache_work(struct work_struct *work) { struct nf_conntrack_net *cnet = container_of(work, struct nf_conntrack_net, ecache.dwork.work); int ret, delay = -1; ret = ecache_work_evict_list(cnet); switch (ret) { case STATE_CONGESTED: delay = ECACHE_RETRY_JIFFIES; break; case STATE_RESTART: delay = 0; break; case STATE_DONE: break; } if (delay >= 0) schedule_delayed_work(&cnet->ecache.dwork, delay); } static int __nf_conntrack_eventmask_report(struct nf_conntrack_ecache *e, const u32 events, const u32 missed, const struct nf_ct_event *item) { struct net *net = nf_ct_net(item->ct); struct nf_ct_event_notifier *notify; u32 old, want; int ret; if (!((events | missed) & e->ctmask)) return 0; rcu_read_lock(); notify = rcu_dereference(net->ct.nf_conntrack_event_cb); if (!notify) { rcu_read_unlock(); return 0; } ret = notify->ct_event(events | missed, item); rcu_read_unlock(); if (likely(ret >= 0 && missed == 0)) return 0; do { old = READ_ONCE(e->missed); if (ret < 0) want = old | events; else want = old & ~missed; } while (cmpxchg(&e->missed, old, want) != old); return ret; } int nf_conntrack_eventmask_report(unsigned int events, struct nf_conn *ct, u32 portid, int report) { struct nf_conntrack_ecache *e; struct nf_ct_event item; unsigned int missed; int ret; if (!nf_ct_is_confirmed(ct)) return 0; e = nf_ct_ecache_find(ct); if (!e) return 0; memset(&item, 0, sizeof(item)); item.ct = ct; item.portid = e->portid ? e->portid : portid; item.report = report; /* This is a resent of a destroy event? If so, skip missed */ missed = e->portid ? 0 : e->missed; ret = __nf_conntrack_eventmask_report(e, events, missed, &item); if (unlikely(ret < 0 && (events & (1 << IPCT_DESTROY)))) { /* This is a destroy event that has been triggered by a process, * we store the PORTID to include it in the retransmission. */ if (e->portid == 0 && portid != 0) e->portid = portid; } return ret; } EXPORT_SYMBOL_GPL(nf_conntrack_eventmask_report); /* deliver cached events and clear cache entry - must be called with locally * disabled softirqs */ void nf_ct_deliver_cached_events(struct nf_conn *ct) { struct nf_conntrack_ecache *e; struct nf_ct_event item; unsigned int events; if (!nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct)) return; e = nf_ct_ecache_find(ct); if (e == NULL) return; events = xchg(&e->cache, 0); item.ct = ct; item.portid = 0; item.report = 0; /* We make a copy of the missed event cache without taking * the lock, thus we may send missed events twice. However, * this does not harm and it happens very rarely. */ __nf_conntrack_eventmask_report(e, events, e->missed, &item); } EXPORT_SYMBOL_GPL(nf_ct_deliver_cached_events); void nf_ct_expect_event_report(enum ip_conntrack_expect_events event, struct nf_conntrack_expect *exp, u32 portid, int report) { struct net *net = nf_ct_exp_net(exp); struct nf_ct_event_notifier *notify; struct nf_conntrack_ecache *e; rcu_read_lock(); notify = rcu_dereference(net->ct.nf_conntrack_event_cb); if (!notify) goto out_unlock; e = nf_ct_ecache_find(exp->master); if (!e) goto out_unlock; if (e->expmask & (1 << event)) { struct nf_exp_event item = { .exp = exp, .portid = portid, .report = report }; notify->exp_event(1 << event, &item); } out_unlock: rcu_read_unlock(); } void nf_conntrack_register_notifier(struct net *net, const struct nf_ct_event_notifier *new) { struct nf_ct_event_notifier *notify; mutex_lock(&nf_ct_ecache_mutex); notify = rcu_dereference_protected(net->ct.nf_conntrack_event_cb, lockdep_is_held(&nf_ct_ecache_mutex)); WARN_ON_ONCE(notify); rcu_assign_pointer(net->ct.nf_conntrack_event_cb, new); mutex_unlock(&nf_ct_ecache_mutex); } EXPORT_SYMBOL_GPL(nf_conntrack_register_notifier); void nf_conntrack_unregister_notifier(struct net *net) { mutex_lock(&nf_ct_ecache_mutex); RCU_INIT_POINTER(net->ct.nf_conntrack_event_cb, NULL); mutex_unlock(&nf_ct_ecache_mutex); /* synchronize_rcu() is called after netns pre_exit */ } EXPORT_SYMBOL_GPL(nf_conntrack_unregister_notifier); void nf_conntrack_ecache_work(struct net *net, enum nf_ct_ecache_state state) { struct nf_conntrack_net *cnet = nf_ct_pernet(net); if (state == NFCT_ECACHE_DESTROY_FAIL && !delayed_work_pending(&cnet->ecache.dwork)) { schedule_delayed_work(&cnet->ecache.dwork, HZ); net->ct.ecache_dwork_pending = true; } else if (state == NFCT_ECACHE_DESTROY_SENT) { if (!hlist_nulls_empty(&cnet->ecache.dying_list)) mod_delayed_work(system_wq, &cnet->ecache.dwork, 0); else net->ct.ecache_dwork_pending = false; } } bool nf_ct_ecache_ext_add(struct nf_conn *ct, u16 ctmask, u16 expmask, gfp_t gfp) { struct net *net = nf_ct_net(ct); struct nf_conntrack_ecache *e; switch (net->ct.sysctl_events) { case 0: /* assignment via template / ruleset? ignore sysctl. */ if (ctmask || expmask) break; return true; case 2: /* autodetect: no event listener, don't allocate extension. */ if (!READ_ONCE(nf_ctnetlink_has_listener)) return true; fallthrough; case 1: /* always allocate an extension. */ if (!ctmask && !expmask) { ctmask = ~0; expmask = ~0; } break; default: WARN_ON_ONCE(1); return true; } e = nf_ct_ext_add(ct, NF_CT_EXT_ECACHE, gfp); if (e) { e->ctmask = ctmask; e->expmask = expmask; } return e != NULL; } EXPORT_SYMBOL_GPL(nf_ct_ecache_ext_add); #define NF_CT_EVENTS_DEFAULT 2 static int nf_ct_events __read_mostly = NF_CT_EVENTS_DEFAULT; void nf_conntrack_ecache_pernet_init(struct net *net) { struct nf_conntrack_net *cnet = nf_ct_pernet(net); net->ct.sysctl_events = nf_ct_events; INIT_DELAYED_WORK(&cnet->ecache.dwork, ecache_work); INIT_HLIST_NULLS_HEAD(&cnet->ecache.dying_list, DYING_NULLS_VAL); spin_lock_init(&cnet->ecache.dying_lock); BUILD_BUG_ON(__IPCT_MAX >= 16); /* e->ctmask is u16 */ } void nf_conntrack_ecache_pernet_fini(struct net *net) { struct nf_conntrack_net *cnet = nf_ct_pernet(net); cancel_delayed_work_sync(&cnet->ecache.dwork); } |
12 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2011 Instituto Nokia de Tecnologia * * Authors: * Lauro Ramos Venancio <lauro.venancio@openbossa.org> * Aloisio Almeida Jr <aloisio.almeida@openbossa.org> */ #define pr_fmt(fmt) KBUILD_MODNAME ": %s: " fmt, __func__ #include <linux/init.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/rfkill.h> #include <linux/nfc.h> #include <net/genetlink.h> #include "nfc.h" #define VERSION "0.1" #define NFC_CHECK_PRES_FREQ_MS 2000 int nfc_devlist_generation; DEFINE_MUTEX(nfc_devlist_mutex); /* NFC device ID bitmap */ static DEFINE_IDA(nfc_index_ida); int nfc_fw_download(struct nfc_dev *dev, const char *firmware_name) { int rc = 0; pr_debug("%s do firmware %s\n", dev_name(&dev->dev), firmware_name); device_lock(&dev->dev); if (dev->shutting_down) { rc = -ENODEV; goto error; } if (dev->dev_up) { rc = -EBUSY; goto error; } if (!dev->ops->fw_download) { rc = -EOPNOTSUPP; goto error; } dev->fw_download_in_progress = true; rc = dev->ops->fw_download(dev, firmware_name); if (rc) dev->fw_download_in_progress = false; error: device_unlock(&dev->dev); return rc; } /** * nfc_fw_download_done - inform that a firmware download was completed * * @dev: The nfc device to which firmware was downloaded * @firmware_name: The firmware filename * @result: The positive value of a standard errno value */ int nfc_fw_download_done(struct nfc_dev *dev, const char *firmware_name, u32 result) { dev->fw_download_in_progress = false; return nfc_genl_fw_download_done(dev, firmware_name, result); } EXPORT_SYMBOL(nfc_fw_download_done); /** * nfc_dev_up - turn on the NFC device * * @dev: The nfc device to be turned on * * The device remains up until the nfc_dev_down function is called. */ int nfc_dev_up(struct nfc_dev *dev) { int rc = 0; pr_debug("dev_name=%s\n", dev_name(&dev->dev)); device_lock(&dev->dev); if (dev->shutting_down) { rc = -ENODEV; goto error; } if (dev->rfkill && rfkill_blocked(dev->rfkill)) { rc = -ERFKILL; goto error; } if (dev->fw_download_in_progress) { rc = -EBUSY; goto error; } if (dev->dev_up) { rc = -EALREADY; goto error; } if (dev->ops->dev_up) rc = dev->ops->dev_up(dev); if (!rc) dev->dev_up = true; /* We have to enable the device before discovering SEs */ if (dev->ops->discover_se && dev->ops->discover_se(dev)) pr_err("SE discovery failed\n"); error: device_unlock(&dev->dev); return rc; } /** * nfc_dev_down - turn off the NFC device * * @dev: The nfc device to be turned off */ int nfc_dev_down(struct nfc_dev *dev) { int rc = 0; pr_debug("dev_name=%s\n", dev_name(&dev->dev)); device_lock(&dev->dev); if (dev->shutting_down) { rc = -ENODEV; goto error; } if (!dev->dev_up) { rc = -EALREADY; goto error; } if (dev->polling || dev->active_target) { rc = -EBUSY; goto error; } if (dev->ops->dev_down) dev->ops->dev_down(dev); dev->dev_up = false; error: device_unlock(&dev->dev); return rc; } static int nfc_rfkill_set_block(void *data, bool blocked) { struct nfc_dev *dev = data; pr_debug("%s blocked %d", dev_name(&dev->dev), blocked); if (!blocked) return 0; nfc_dev_down(dev); return 0; } static const struct rfkill_ops nfc_rfkill_ops = { .set_block = nfc_rfkill_set_block, }; /** * nfc_start_poll - start polling for nfc targets * * @dev: The nfc device that must start polling * @im_protocols: bitset of nfc initiator protocols to be used for polling * @tm_protocols: bitset of nfc transport protocols to be used for polling * * The device remains polling for targets until a target is found or * the nfc_stop_poll function is called. */ int nfc_start_poll(struct nfc_dev *dev, u32 im_protocols, u32 tm_protocols) { int rc; pr_debug("dev_name %s initiator protocols 0x%x target protocols 0x%x\n", dev_name(&dev->dev), im_protocols, tm_protocols); if (!im_protocols && !tm_protocols) return -EINVAL; device_lock(&dev->dev); if (dev->shutting_down) { rc = -ENODEV; goto error; } if (!dev->dev_up) { rc = -ENODEV; goto error; } if (dev->polling) { rc = -EBUSY; goto error; } rc = dev->ops->start_poll(dev, im_protocols, tm_protocols); if (!rc) { dev->polling = true; dev->rf_mode = NFC_RF_NONE; } error: device_unlock(&dev->dev); return rc; } /** * nfc_stop_poll - stop polling for nfc targets * * @dev: The nfc device that must stop polling */ int nfc_stop_poll(struct nfc_dev *dev) { int rc = 0; pr_debug("dev_name=%s\n", dev_name(&dev->dev)); device_lock(&dev->dev); if (dev->shutting_down) { rc = -ENODEV; goto error; } if (!dev->polling) { rc = -EINVAL; goto error; } dev->ops->stop_poll(dev); dev->polling = false; dev->rf_mode = NFC_RF_NONE; error: device_unlock(&dev->dev); return rc; } static struct nfc_target *nfc_find_target(struct nfc_dev *dev, u32 target_idx) { int i; for (i = 0; i < dev->n_targets; i++) { if (dev->targets[i].idx == target_idx) return &dev->targets[i]; } return NULL; } int nfc_dep_link_up(struct nfc_dev *dev, int target_index, u8 comm_mode) { int rc = 0; u8 *gb; size_t gb_len; struct nfc_target *target; pr_debug("dev_name=%s comm %d\n", dev_name(&dev->dev), comm_mode); if (!dev->ops->dep_link_up) return -EOPNOTSUPP; device_lock(&dev->dev); if (dev->shutting_down) { rc = -ENODEV; goto error; } if (dev->dep_link_up == true) { rc = -EALREADY; goto error; } gb = nfc_llcp_general_bytes(dev, &gb_len); if (gb_len > NFC_MAX_GT_LEN) { rc = -EINVAL; goto error; } target = nfc_find_target(dev, target_index); if (target == NULL) { rc = -ENOTCONN; goto error; } rc = dev->ops->dep_link_up(dev, target, comm_mode, gb, gb_len); if (!rc) { dev->active_target = target; dev->rf_mode = NFC_RF_INITIATOR; } error: device_unlock(&dev->dev); return rc; } int nfc_dep_link_down(struct nfc_dev *dev) { int rc = 0; pr_debug("dev_name=%s\n", dev_name(&dev->dev)); if (!dev->ops->dep_link_down) return -EOPNOTSUPP; device_lock(&dev->dev); if (dev->shutting_down) { rc = -ENODEV; goto error; } if (dev->dep_link_up == false) { rc = -EALREADY; goto error; } rc = dev->ops->dep_link_down(dev); if (!rc) { dev->dep_link_up = false; dev->active_target = NULL; dev->rf_mode = NFC_RF_NONE; nfc_llcp_mac_is_down(dev); nfc_genl_dep_link_down_event(dev); } error: device_unlock(&dev->dev); return rc; } int nfc_dep_link_is_up(struct nfc_dev *dev, u32 target_idx, u8 comm_mode, u8 rf_mode) { dev->dep_link_up = true; if (!dev->active_target && rf_mode == NFC_RF_INITIATOR) { struct nfc_target *target; target = nfc_find_target(dev, target_idx); if (target == NULL) return -ENOTCONN; dev->active_target = target; } dev->polling = false; dev->rf_mode = rf_mode; nfc_llcp_mac_is_up(dev, target_idx, comm_mode, rf_mode); return nfc_genl_dep_link_up_event(dev, target_idx, comm_mode, rf_mode); } EXPORT_SYMBOL(nfc_dep_link_is_up); /** * nfc_activate_target - prepare the target for data exchange * * @dev: The nfc device that found the target * @target_idx: index of the target that must be activated * @protocol: nfc protocol that will be used for data exchange */ int nfc_activate_target(struct nfc_dev *dev, u32 target_idx, u32 protocol) { int rc; struct nfc_target *target; pr_debug("dev_name=%s target_idx=%u protocol=%u\n", dev_name(&dev->dev), target_idx, protocol); device_lock(&dev->dev); if (dev->shutting_down) { rc = -ENODEV; goto error; } if (dev->active_target) { rc = -EBUSY; goto error; } target = nfc_find_target(dev, target_idx); if (target == NULL) { rc = -ENOTCONN; goto error; } rc = dev->ops->activate_target(dev, target, protocol); if (!rc) { dev->active_target = target; dev->rf_mode = NFC_RF_INITIATOR; if (dev->ops->check_presence && !dev->shutting_down) mod_timer(&dev->check_pres_timer, jiffies + msecs_to_jiffies(NFC_CHECK_PRES_FREQ_MS)); } error: device_unlock(&dev->dev); return rc; } /** * nfc_deactivate_target - deactivate a nfc target * * @dev: The nfc device that found the target * @target_idx: index of the target that must be deactivated * @mode: idle or sleep? */ int nfc_deactivate_target(struct nfc_dev *dev, u32 target_idx, u8 mode) { int rc = 0; pr_debug("dev_name=%s target_idx=%u\n", dev_name(&dev->dev), target_idx); device_lock(&dev->dev); if (dev->shutting_down) { rc = -ENODEV; goto error; } if (dev->active_target == NULL) { rc = -ENOTCONN; goto error; } if (dev->active_target->idx != target_idx) { rc = -ENOTCONN; goto error; } if (dev->ops->check_presence) del_timer_sync(&dev->check_pres_timer); dev->ops->deactivate_target(dev, dev->active_target, mode); dev->active_target = NULL; error: device_unlock(&dev->dev); return rc; } /** * nfc_data_exchange - transceive data * * @dev: The nfc device that found the target * @target_idx: index of the target * @skb: data to be sent * @cb: callback called when the response is received * @cb_context: parameter for the callback function * * The user must wait for the callback before calling this function again. */ int nfc_data_exchange(struct nfc_dev *dev, u32 target_idx, struct sk_buff *skb, data_exchange_cb_t cb, void *cb_context) { int rc; pr_debug("dev_name=%s target_idx=%u skb->len=%u\n", dev_name(&dev->dev), target_idx, skb->len); device_lock(&dev->dev); if (dev->shutting_down) { rc = -ENODEV; kfree_skb(skb); goto error; } if (dev->rf_mode == NFC_RF_INITIATOR && dev->active_target != NULL) { if (dev->active_target->idx != target_idx) { rc = -EADDRNOTAVAIL; kfree_skb(skb); goto error; } if (dev->ops->check_presence) del_timer_sync(&dev->check_pres_timer); rc = dev->ops->im_transceive(dev, dev->active_target, skb, cb, cb_context); if (!rc && dev->ops->check_presence && !dev->shutting_down) mod_timer(&dev->check_pres_timer, jiffies + msecs_to_jiffies(NFC_CHECK_PRES_FREQ_MS)); } else if (dev->rf_mode == NFC_RF_TARGET && dev->ops->tm_send != NULL) { rc = dev->ops->tm_send(dev, skb); } else { rc = -ENOTCONN; kfree_skb(skb); goto error; } error: device_unlock(&dev->dev); return rc; } struct nfc_se *nfc_find_se(struct nfc_dev *dev, u32 se_idx) { struct nfc_se *se; list_for_each_entry(se, &dev->secure_elements, list) if (se->idx == se_idx) return se; return NULL; } EXPORT_SYMBOL(nfc_find_se); int nfc_enable_se(struct nfc_dev *dev, u32 se_idx) { struct nfc_se *se; int rc; pr_debug("%s se index %d\n", dev_name(&dev->dev), se_idx); device_lock(&dev->dev); if (dev->shutting_down) { rc = -ENODEV; goto error; } if (!dev->dev_up) { rc = -ENODEV; goto error; } if (dev->polling) { rc = -EBUSY; goto error; } if (!dev->ops->enable_se || !dev->ops->disable_se) { rc = -EOPNOTSUPP; goto error; } se = nfc_find_se(dev, se_idx); if (!se) { rc = -EINVAL; goto error; } if (se->state == NFC_SE_ENABLED) { rc = -EALREADY; goto error; } rc = dev->ops->enable_se(dev, se_idx); if (rc >= 0) se->state = NFC_SE_ENABLED; error: device_unlock(&dev->dev); return rc; } int nfc_disable_se(struct nfc_dev *dev, u32 se_idx) { struct nfc_se *se; int rc; pr_debug("%s se index %d\n", dev_name(&dev->dev), se_idx); device_lock(&dev->dev); if (dev->shutting_down) { rc = -ENODEV; goto error; } if (!dev->dev_up) { rc = -ENODEV; goto error; } if (!dev->ops->enable_se || !dev->ops->disable_se) { rc = -EOPNOTSUPP; goto error; } se = nfc_find_se(dev, se_idx); if (!se) { rc = -EINVAL; goto error; } if (se->state == NFC_SE_DISABLED) { rc = -EALREADY; goto error; } rc = dev->ops->disable_se(dev, se_idx); if (rc >= 0) se->state = NFC_SE_DISABLED; error: device_unlock(&dev->dev); return rc; } int nfc_set_remote_general_bytes(struct nfc_dev *dev, const u8 *gb, u8 gb_len) { pr_debug("dev_name=%s gb_len=%d\n", dev_name(&dev->dev), gb_len); return nfc_llcp_set_remote_gb(dev, gb, gb_len); } EXPORT_SYMBOL(nfc_set_remote_general_bytes); u8 *nfc_get_local_general_bytes(struct nfc_dev *dev, size_t *gb_len) { pr_debug("dev_name=%s\n", dev_name(&dev->dev)); return nfc_llcp_general_bytes(dev, gb_len); } EXPORT_SYMBOL(nfc_get_local_general_bytes); int nfc_tm_data_received(struct nfc_dev *dev, struct sk_buff *skb) { /* Only LLCP target mode for now */ if (dev->dep_link_up == false) { kfree_skb(skb); return -ENOLINK; } return nfc_llcp_data_received(dev, skb); } EXPORT_SYMBOL(nfc_tm_data_received); int nfc_tm_activated(struct nfc_dev *dev, u32 protocol, u8 comm_mode, const u8 *gb, size_t gb_len) { int rc; device_lock(&dev->dev); dev->polling = false; if (gb != NULL) { rc = nfc_set_remote_general_bytes(dev, gb, gb_len); if (rc < 0) goto out; } dev->rf_mode = NFC_RF_TARGET; if (protocol == NFC_PROTO_NFC_DEP_MASK) nfc_dep_link_is_up(dev, 0, comm_mode, NFC_RF_TARGET); rc = nfc_genl_tm_activated(dev, protocol); out: device_unlock(&dev->dev); return rc; } EXPORT_SYMBOL(nfc_tm_activated); int nfc_tm_deactivated(struct nfc_dev *dev) { dev->dep_link_up = false; dev->rf_mode = NFC_RF_NONE; return nfc_genl_tm_deactivated(dev); } EXPORT_SYMBOL(nfc_tm_deactivated); /** * nfc_alloc_send_skb - allocate a skb for data exchange responses * * @dev: device sending the response * @sk: socket sending the response * @flags: MSG_DONTWAIT flag * @size: size to allocate * @err: pointer to memory to store the error code */ struct sk_buff *nfc_alloc_send_skb(struct nfc_dev *dev, struct sock *sk, unsigned int flags, unsigned int size, unsigned int *err) { struct sk_buff *skb; unsigned int total_size; total_size = size + dev->tx_headroom + dev->tx_tailroom + NFC_HEADER_SIZE; skb = sock_alloc_send_skb(sk, total_size, flags & MSG_DONTWAIT, err); if (skb) skb_reserve(skb, dev->tx_headroom + NFC_HEADER_SIZE); return skb; } /** * nfc_alloc_recv_skb - allocate a skb for data exchange responses * * @size: size to allocate * @gfp: gfp flags */ struct sk_buff *nfc_alloc_recv_skb(unsigned int size, gfp_t gfp) { struct sk_buff *skb; unsigned int total_size; total_size = size + 1; skb = alloc_skb(total_size, gfp); if (skb) skb_reserve(skb, 1); return skb; } EXPORT_SYMBOL(nfc_alloc_recv_skb); /** * nfc_targets_found - inform that targets were found * * @dev: The nfc device that found the targets * @targets: array of nfc targets found * @n_targets: targets array size * * The device driver must call this function when one or many nfc targets * are found. After calling this function, the device driver must stop * polling for targets. * NOTE: This function can be called with targets=NULL and n_targets=0 to * notify a driver error, meaning that the polling operation cannot complete. * IMPORTANT: this function must not be called from an atomic context. * In addition, it must also not be called from a context that would prevent * the NFC Core to call other nfc ops entry point concurrently. */ int nfc_targets_found(struct nfc_dev *dev, struct nfc_target *targets, int n_targets) { int i; pr_debug("dev_name=%s n_targets=%d\n", dev_name(&dev->dev), n_targets); for (i = 0; i < n_targets; i++) targets[i].idx = dev->target_next_idx++; device_lock(&dev->dev); if (dev->polling == false) { device_unlock(&dev->dev); return 0; } dev->polling = false; dev->targets_generation++; kfree(dev->targets); dev->targets = NULL; if (targets) { dev->targets = kmemdup(targets, n_targets * sizeof(struct nfc_target), GFP_ATOMIC); if (!dev->targets) { dev->n_targets = 0; device_unlock(&dev->dev); return -ENOMEM; } } dev->n_targets = n_targets; device_unlock(&dev->dev); nfc_genl_targets_found(dev); return 0; } EXPORT_SYMBOL(nfc_targets_found); /** * nfc_target_lost - inform that an activated target went out of field * * @dev: The nfc device that had the activated target in field * @target_idx: the nfc index of the target * * The device driver must call this function when the activated target * goes out of the field. * IMPORTANT: this function must not be called from an atomic context. * In addition, it must also not be called from a context that would prevent * the NFC Core to call other nfc ops entry point concurrently. */ int nfc_target_lost(struct nfc_dev *dev, u32 target_idx) { const struct nfc_target *tg; int i; pr_debug("dev_name %s n_target %d\n", dev_name(&dev->dev), target_idx); device_lock(&dev->dev); for (i = 0; i < dev->n_targets; i++) { tg = &dev->targets[i]; if (tg->idx == target_idx) break; } if (i == dev->n_targets) { device_unlock(&dev->dev); return -EINVAL; } dev->targets_generation++; dev->n_targets--; dev->active_target = NULL; if (dev->n_targets) { memcpy(&dev->targets[i], &dev->targets[i + 1], (dev->n_targets - i) * sizeof(struct nfc_target)); } else { kfree(dev->targets); dev->targets = NULL; } device_unlock(&dev->dev); nfc_genl_target_lost(dev, target_idx); return 0; } EXPORT_SYMBOL(nfc_target_lost); inline void nfc_driver_failure(struct nfc_dev *dev, int err) { nfc_targets_found(dev, NULL, 0); } EXPORT_SYMBOL(nfc_driver_failure); int nfc_add_se(struct nfc_dev *dev, u32 se_idx, u16 type) { struct nfc_se *se; int rc; pr_debug("%s se index %d\n", dev_name(&dev->dev), se_idx); se = nfc_find_se(dev, se_idx); if (se) return -EALREADY; se = kzalloc(sizeof(struct nfc_se), GFP_KERNEL); if (!se) return -ENOMEM; se->idx = se_idx; se->type = type; se->state = NFC_SE_DISABLED; INIT_LIST_HEAD(&se->list); list_add(&se->list, &dev->secure_elements); rc = nfc_genl_se_added(dev, se_idx, type); if (rc < 0) { list_del(&se->list); kfree(se); return rc; } return 0; } EXPORT_SYMBOL(nfc_add_se); int nfc_remove_se(struct nfc_dev *dev, u32 se_idx) { struct nfc_se *se, *n; int rc; pr_debug("%s se index %d\n", dev_name(&dev->dev), se_idx); list_for_each_entry_safe(se, n, &dev->secure_elements, list) if (se->idx == se_idx) { rc = nfc_genl_se_removed(dev, se_idx); if (rc < 0) return rc; list_del(&se->list); kfree(se); return 0; } return -EINVAL; } EXPORT_SYMBOL(nfc_remove_se); int nfc_se_transaction(struct nfc_dev *dev, u8 se_idx, struct nfc_evt_transaction *evt_transaction) { int rc; pr_debug("transaction: %x\n", se_idx); device_lock(&dev->dev); if (!evt_transaction) { rc = -EPROTO; goto out; } rc = nfc_genl_se_transaction(dev, se_idx, evt_transaction); out: device_unlock(&dev->dev); return rc; } EXPORT_SYMBOL(nfc_se_transaction); int nfc_se_connectivity(struct nfc_dev *dev, u8 se_idx) { int rc; pr_debug("connectivity: %x\n", se_idx); device_lock(&dev->dev); rc = nfc_genl_se_connectivity(dev, se_idx); device_unlock(&dev->dev); return rc; } EXPORT_SYMBOL(nfc_se_connectivity); static void nfc_release(struct device *d) { struct nfc_dev *dev = to_nfc_dev(d); struct nfc_se *se, *n; pr_debug("dev_name=%s\n", dev_name(&dev->dev)); nfc_genl_data_exit(&dev->genl_data); kfree(dev->targets); list_for_each_entry_safe(se, n, &dev->secure_elements, list) { nfc_genl_se_removed(dev, se->idx); list_del(&se->list); kfree(se); } ida_free(&nfc_index_ida, dev->idx); kfree(dev); } static void nfc_check_pres_work(struct work_struct *work) { struct nfc_dev *dev = container_of(work, struct nfc_dev, check_pres_work); int rc; device_lock(&dev->dev); if (dev->active_target && timer_pending(&dev->check_pres_timer) == 0) { rc = dev->ops->check_presence(dev, dev->active_target); if (rc == -EOPNOTSUPP) goto exit; if (rc) { u32 active_target_idx = dev->active_target->idx; device_unlock(&dev->dev); nfc_target_lost(dev, active_target_idx); return; } if (!dev->shutting_down) mod_timer(&dev->check_pres_timer, jiffies + msecs_to_jiffies(NFC_CHECK_PRES_FREQ_MS)); } exit: device_unlock(&dev->dev); } static void nfc_check_pres_timeout(struct timer_list *t) { struct nfc_dev *dev = from_timer(dev, t, check_pres_timer); schedule_work(&dev->check_pres_work); } struct class nfc_class = { .name = "nfc", .dev_release = nfc_release, }; EXPORT_SYMBOL(nfc_class); static int match_idx(struct device *d, const void *data) { struct nfc_dev *dev = to_nfc_dev(d); const unsigned int *idx = data; return dev->idx == *idx; } struct nfc_dev *nfc_get_device(unsigned int idx) { struct device *d; d = class_find_device(&nfc_class, NULL, &idx, match_idx); if (!d) return NULL; return to_nfc_dev(d); } /** * nfc_allocate_device - allocate a new nfc device * * @ops: device operations * @supported_protocols: NFC protocols supported by the device * @tx_headroom: reserved space at beginning of skb * @tx_tailroom: reserved space at end of skb */ struct nfc_dev *nfc_allocate_device(const struct nfc_ops *ops, u32 supported_protocols, int tx_headroom, int tx_tailroom) { struct nfc_dev *dev; int rc; if (!ops->start_poll || !ops->stop_poll || !ops->activate_target || !ops->deactivate_target || !ops->im_transceive) return NULL; if (!supported_protocols) return NULL; dev = kzalloc(sizeof(struct nfc_dev), GFP_KERNEL); if (!dev) return NULL; rc = ida_alloc(&nfc_index_ida, GFP_KERNEL); if (rc < 0) goto err_free_dev; dev->idx = rc; dev->dev.class = &nfc_class; dev_set_name(&dev->dev, "nfc%d", dev->idx); device_initialize(&dev->dev); dev->ops = ops; dev->supported_protocols = supported_protocols; dev->tx_headroom = tx_headroom; dev->tx_tailroom = tx_tailroom; INIT_LIST_HEAD(&dev->secure_elements); nfc_genl_data_init(&dev->genl_data); dev->rf_mode = NFC_RF_NONE; /* first generation must not be 0 */ dev->targets_generation = 1; if (ops->check_presence) { timer_setup(&dev->check_pres_timer, nfc_check_pres_timeout, 0); INIT_WORK(&dev->check_pres_work, nfc_check_pres_work); } return dev; err_free_dev: kfree(dev); return NULL; } EXPORT_SYMBOL(nfc_allocate_device); /** * nfc_register_device - register a nfc device in the nfc subsystem * * @dev: The nfc device to register */ int nfc_register_device(struct nfc_dev *dev) { int rc; pr_debug("dev_name=%s\n", dev_name(&dev->dev)); mutex_lock(&nfc_devlist_mutex); nfc_devlist_generation++; rc = device_add(&dev->dev); mutex_unlock(&nfc_devlist_mutex); if (rc < 0) return rc; rc = nfc_llcp_register_device(dev); if (rc) pr_err("Could not register llcp device\n"); device_lock(&dev->dev); dev->rfkill = rfkill_alloc(dev_name(&dev->dev), &dev->dev, RFKILL_TYPE_NFC, &nfc_rfkill_ops, dev); if (dev->rfkill) { if (rfkill_register(dev->rfkill) < 0) { rfkill_destroy(dev->rfkill); dev->rfkill = NULL; } } dev->shutting_down = false; device_unlock(&dev->dev); rc = nfc_genl_device_added(dev); if (rc) pr_debug("The userspace won't be notified that the device %s was added\n", dev_name(&dev->dev)); return 0; } EXPORT_SYMBOL(nfc_register_device); /** * nfc_unregister_device - unregister a nfc device in the nfc subsystem * * @dev: The nfc device to unregister */ void nfc_unregister_device(struct nfc_dev *dev) { int rc; pr_debug("dev_name=%s\n", dev_name(&dev->dev)); rc = nfc_genl_device_removed(dev); if (rc) pr_debug("The userspace won't be notified that the device %s " "was removed\n", dev_name(&dev->dev)); device_lock(&dev->dev); if (dev->rfkill) { rfkill_unregister(dev->rfkill); rfkill_destroy(dev->rfkill); dev->rfkill = NULL; } dev->shutting_down = true; device_unlock(&dev->dev); if (dev->ops->check_presence) { del_timer_sync(&dev->check_pres_timer); cancel_work_sync(&dev->check_pres_work); } nfc_llcp_unregister_device(dev); mutex_lock(&nfc_devlist_mutex); nfc_devlist_generation++; device_del(&dev->dev); mutex_unlock(&nfc_devlist_mutex); } EXPORT_SYMBOL(nfc_unregister_device); static int __init nfc_init(void) { int rc; pr_info("NFC Core ver %s\n", VERSION); rc = class_register(&nfc_class); if (rc) return rc; rc = nfc_genl_init(); if (rc) goto err_genl; /* the first generation must not be 0 */ nfc_devlist_generation = 1; rc = rawsock_init(); if (rc) goto err_rawsock; rc = nfc_llcp_init(); if (rc) goto err_llcp_sock; rc = af_nfc_init(); if (rc) goto err_af_nfc; return 0; err_af_nfc: nfc_llcp_exit(); err_llcp_sock: rawsock_exit(); err_rawsock: nfc_genl_exit(); err_genl: class_unregister(&nfc_class); return rc; } static void __exit nfc_exit(void) { af_nfc_exit(); nfc_llcp_exit(); rawsock_exit(); nfc_genl_exit(); class_unregister(&nfc_class); } subsys_initcall(nfc_init); module_exit(nfc_exit); MODULE_AUTHOR("Lauro Ramos Venancio <lauro.venancio@openbossa.org>"); MODULE_DESCRIPTION("NFC Core ver " VERSION); MODULE_VERSION(VERSION); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_NFC); MODULE_ALIAS_GENL_FAMILY(NFC_GENL_NAME); |